From d63c664ca0021fbac31cee57ff1eaa8bce3d1903 Mon Sep 17 00:00:00 2001
From: rui-ren
Date: Thu, 15 Feb 2024 00:02:08 -0800
Subject: [PATCH 001/279] fix rocm ci pipeline (#19525)

### Description
The ROCm CI pipeline fails because the `run_mlm.py` step can no longer download the wikitext dataset with the pinned `datasets==1.9.0`:

```
Downloading and preparing dataset wikitext/wikitext-2-raw-v1 (download: 4.50 MiB, generated: 12.91 MiB, post-processed: Unknown size, total: 17.41 MiB) to /home/onnxruntimedev/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20...
    main()
  File "/stage/huggingface-transformers/examples/pytorch/language-modeling/run_mlm.py", line 242, in main
    datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
  File "/opt/miniconda/envs/rocm-ci/lib/python3.9/site-packages/datasets/load.py", line 856, in load_dataset
    builder_instance.download_and_prepare(
  File "/opt/miniconda/envs/rocm-ci/lib/python3.9/site-packages/datasets/builder.py", line 583, in download_and_prepare
    self._download_and_prepare(
  File "/opt/miniconda/envs/rocm-ci/lib/python3.9/site-packages/datasets/builder.py", line 639, in _download_and_prepare
    split_generators = self._split_generators(dl_manager, **split_generators_kwargs)
  File "/home/onnxruntimedev/.cache/huggingface/modules/datasets_modules/datasets/wikitext/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20/wikitext.py", line 138, in _split_generators
    data_file = dl_manager.download_and_extract(self.config.data_url)
  File "/opt/miniconda/envs/rocm-ci/lib/python3.9/site-packages/datasets/utils/download_manager.py", line 289, in download_and_extract
    return self.extract(self.download(url_or_urls))
  File "/opt/miniconda/envs/rocm-ci/lib/python3.9/site-packages/datasets/utils/download_manager.py", line 197, in download
    downloaded_path_or_paths = map_nested(
  File "/opt/miniconda/envs/rocm-ci/lib/python3.9/site-packages/datasets/utils/py_utils.py", line 195, in map_nested
    return function(data_struct)
  File "/opt/miniconda/envs/rocm-ci/lib/python3.9/site-packages/datasets/utils/download_manager.py", line 220, in _download
    return cached_path(url_or_filename, download_config=download_config)
  File "/opt/miniconda/envs/rocm-ci/lib/python3.9/site-packages/datasets/utils/file_utils.py", line 281, in cached_path
    output_path = get_from_cache(
  File "/opt/miniconda/envs/rocm-ci/lib/python3.9/site-packages/datasets/utils/file_utils.py", line 634, in get_from_cache
    raise ConnectionError("Couldn't reach {}".format(url))
ConnectionError: Couldn't reach https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
```

### Motivation and Context
Update the `datasets` package to the latest version, `2.17.0`.
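As a quick sanity check (a hedged sketch, not part of this patch; it assumes the CI image has `datasets==2.17.0` installed and has network access), the same dataset the failing step requests can be loaded directly:

```python
# Minimal reproduction of the failing call path: run_mlm.py ultimately calls
# load_dataset("wikitext", "wikitext-2-raw-v1"); with datasets==2.17.0 this
# should complete instead of raising the ConnectionError shown above.
from datasets import load_dataset

ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
print(ds)  # prints the dataset summary on success
```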
--- tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile index 64710a982a29d..496b57b417fbd 100644 --- a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile @@ -112,7 +112,7 @@ RUN pip install \ cerberus \ sympy \ h5py \ - datasets==1.9.0 \ + datasets==2.17.0 \ requests \ sacrebleu==1.5.1 \ sacremoses \ From d0061d6fb15d40eeb35fa1b40a414cd231d51db9 Mon Sep 17 00:00:00 2001 From: sophies927 <107952697+sophies927@users.noreply.github.com> Date: Thu, 15 Feb 2024 17:03:11 -0800 Subject: [PATCH 002/279] Update stale.yml to use old version as a bug fix (#19532) ### Description Changed the actions/stale version back to v8 from v9. ### Motivation and Context There is a well-documented issue w/ the new actions/stale version (v9.0.0) that causes the following error: "Error delete _state: [403] Resource not accessible by integration". See https://github.com/actions/stale/issues/1133 for more context. This issue is preventing the stale bot from labeling stale issues since the version was updated b/c the action can no longer access the cache and cannot apply labels to all issues due to GH API rate limiting. There are two potential fixes if we continue to use the new version: (1) run the action on all PRs/issues to avoid using the cache or (2) give write access to the endpoints listed in https://docs.github.com/en/rest/authentication/permissions-required-for-fine-grained-personal-access-tokens?apiVersion=2022-11-28#repository-permissions-for-actions. Neither of these options is preferable, so I am going to wait until the bug is fixed. Note: The old version (v8.0.0) uses Node 16, which will be deprecated in Spring 2024, instead of Node 20, so we should keep an eye on [this issue](https://github.com/actions/stale/issues/1133) to see when they make the fix and we can switch back to the new version. --- .github/workflows/stale.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index c94e3fa5bcb8c..181f3fb17d332 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -13,7 +13,7 @@ jobs: issues: write pull-requests: write steps: - - uses: actions/stale@v9.0.0 + - uses: actions/stale@v8 with: # Comma separated list of labels that can be assigned to issues to exclude them from being marked as stale exempt-issue-labels: contributions welcome, feature request, regression From 4bfa69def85476b33ccfaf68cf070f3fb65d39f7 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Thu, 15 Feb 2024 20:22:36 -0800 Subject: [PATCH 003/279] Speed Up DecoderMaskedSelfAttentionTest (#19531) ### Description The unit tests take 19 minutes to run (in debug build) because of too many combinations. I reduce the combinations and remain good test coverage. After the change, the test can finish in 51 seconds. 
Before: [----------] 2 tests from DecoderMaskedSelfAttentionTest [ RUN ] DecoderMaskedSelfAttentionTest.Test_fp32 [ OK ] DecoderMaskedSelfAttentionTest.Test_fp32 (394086 ms) [ RUN ] DecoderMaskedSelfAttentionTest.Test_fp16 [ OK ] DecoderMaskedSelfAttentionTest.Test_fp16 (747035 ms) [----------] 2 tests from DecoderMaskedSelfAttentionTest (1141122 ms total) After: [----------] 2 tests from DecoderMaskedSelfAttentionTest [ RUN ] DecoderMaskedSelfAttentionTest.Test_fp32 [ OK ] DecoderMaskedSelfAttentionTest.Test_fp32 (21057 ms) [ RUN ] DecoderMaskedSelfAttentionTest.Test_fp16 [ OK ] DecoderMaskedSelfAttentionTest.Test_fp16 (30653 ms) [----------] 2 tests from DecoderMaskedSelfAttentionTest (51710 ms total) ### Motivation and Context Reduce test time, and improve build pipeline efficiency. --- ...oder_masked_multihead_attention_op_test.cc | 451 ++++++++++-------- 1 file changed, 242 insertions(+), 209 deletions(-) diff --git a/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc b/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc index 6afb61bd1f0a1..8ea37ad054ed0 100644 --- a/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc @@ -640,122 +640,139 @@ TEST(DecoderMaskedSelfAttentionTest, Test_fp32) { return; } - // Vary batch size - for (int batch_size = 1; batch_size <= 5; batch_size += 2) { - // Vary kv_lengths - for (int past_sequence_length = 1; past_sequence_length <= 3000; past_sequence_length += 150) { - int sequence_length = 1; - int number_of_heads = 12; - // Vary head_size / hidden_size - int hidden_sizes[3] = {384, 768, 1536}; - for (int hidden_size : hidden_sizes) { - int head_size = (hidden_size / number_of_heads); - int total_sequence_length = sequence_length + past_sequence_length; - int max_sequence_length = past_sequence_length + 1; // Always keep > past_sequence_length - - OpTester tester("DecoderMaskedSelfAttention", 1, onnxruntime::kMSDomain); - tester.AddAttribute("num_heads", static_cast(number_of_heads)); - tester.AddAttribute("past_present_share_buffer", static_cast(1)); - - std::vector input_dims = {batch_size, sequence_length, hidden_size}; - std::vector weights_dims = {hidden_size, 3 * hidden_size}; - std::vector bias_dims = {3 * hidden_size}; - std::vector output_dims = {batch_size, sequence_length, hidden_size}; - - auto input = CreateRandom(batch_size * sequence_length * hidden_size); - tester.AddInput("input", input_dims, input); - - auto weight = CreateRandom(hidden_size * 3 * hidden_size); - tester.AddInput("weight", weights_dims, weight); - - auto bias = CreateRandom(3 * hidden_size); - tester.AddInput("bias", bias_dims, bias); - - // Mask - tester.AddOptionalInputEdge(); - - // Past - std::vector past_dims = {2, batch_size, number_of_heads, max_sequence_length, head_size}; - int past_present_size = 2 * batch_size * number_of_heads * max_sequence_length * head_size; - - auto kv_cache = CreateRandom(past_present_size); - - auto reordered_kv_cache = ReorderKVCache(kv_cache, batch_size, - number_of_heads, past_sequence_length, head_size, max_sequence_length); - - // Validate if reordering went well - by transposing and checking equality - int chunk_size = 16 / sizeof(float); - int num_chunks = head_size / chunk_size; - auto transposed = Transpose(kv_cache.data(), batch_size, number_of_heads, num_chunks, max_sequence_length, chunk_size); - CheckEquality(transposed.data(), reordered_kv_cache.data(), batch_size, 
number_of_heads, num_chunks, - max_sequence_length, past_sequence_length, chunk_size); - - tester.AddInput("past", past_dims, reordered_kv_cache); - - // Rel - tester.AddOptionalInputEdge(); - - // Past sequence length - std::vector arr_past_sequence_len(1, past_sequence_length); - tester.AddInput("past_sequence_length", {1}, arr_past_sequence_len); - - // QKV MatMul - auto qkv = QKV(input, weight, bias, batch_size, sequence_length, hidden_size); - auto* qkv_matrix = qkv.data(); - - auto pair = MergePastKWithPresentKAndTranspose(kv_cache.data(), qkv_matrix + hidden_size, batch_size, - number_of_heads, past_sequence_length, - max_sequence_length, head_size); - - auto k_merged = pair.first; - auto k_transpose = pair.second; - - auto qk_transpose = QK_Transpose(qkv_matrix, k_transpose.data(), batch_size, number_of_heads, - total_sequence_length, head_size); - - auto softmax_qk_transpose = Softmax_QK_Transpose(qk_transpose.data(), batch_size, number_of_heads, - sequence_length, total_sequence_length, head_size); - - auto present = MergeReorderedKVCacheWithK(reordered_kv_cache, qkv_matrix + hidden_size, batch_size, - number_of_heads, past_sequence_length, max_sequence_length, head_size); - - // Validate our test logic - // We want to validate if our merged "unordered" K is the same as - // the merged "ordered" K so that the QKT we do in our test code - // is equivalent to the QKT we do in the kernel - ValidateReorderedMergedKWithK(k_merged.data(), present.data(), batch_size, number_of_heads, total_sequence_length, max_sequence_length, head_size); + // Buckets for test data: + // batch_size: 1, >=2 + // past_sequence_length 0~30, 31~2046, >=2047 (so that total_sequence_length: 1~31, 32~2047, >=2048) + // head_size: 32, 64, 128 + struct MyTestCase { + int batch_size; + int past_sequence_length; + int hidden_size; + } test_cases[] = { + {1, 0, 768}, + {1, 1, 384}, + {2, 30, 768}, + {3, 31, 1536}, + {4, 512, 384}, + {1, 1024, 768}, + {1, 2046, 1536}, + {2, 2047, 384}, + {3, 3000, 768}, + }; + + constexpr int sequence_length = 1; + constexpr int number_of_heads = 12; + + for (MyTestCase test_case : test_cases) { + int batch_size = test_case.batch_size; + int past_sequence_length = test_case.past_sequence_length; + int hidden_size = test_case.hidden_size; + + int head_size = (hidden_size / number_of_heads); + int total_sequence_length = sequence_length + past_sequence_length; + int max_sequence_length = past_sequence_length + 1; // Always keep > past_sequence_length + + OpTester tester("DecoderMaskedSelfAttention", 1, onnxruntime::kMSDomain); + tester.AddAttribute("num_heads", static_cast(number_of_heads)); + tester.AddAttribute("past_present_share_buffer", static_cast(1)); + + std::vector input_dims = {batch_size, sequence_length, hidden_size}; + std::vector weights_dims = {hidden_size, 3 * hidden_size}; + std::vector bias_dims = {3 * hidden_size}; + std::vector output_dims = {batch_size, sequence_length, hidden_size}; + + auto input = CreateRandom(batch_size * sequence_length * hidden_size); + tester.AddInput("input", input_dims, input); + + auto weight = CreateRandom(hidden_size * 3 * hidden_size); + tester.AddInput("weight", weights_dims, weight); + + auto bias = CreateRandom(3 * hidden_size); + tester.AddInput("bias", bias_dims, bias); + + // Mask + tester.AddOptionalInputEdge(); + + // Past + std::vector past_dims = {2, batch_size, number_of_heads, max_sequence_length, head_size}; + int past_present_size = 2 * batch_size * number_of_heads * max_sequence_length * head_size; + + auto kv_cache = 
CreateRandom(past_present_size); + + auto reordered_kv_cache = ReorderKVCache(kv_cache, batch_size, + number_of_heads, past_sequence_length, head_size, max_sequence_length); + + // Validate if reordering went well - by transposing and checking equality + int chunk_size = 16 / sizeof(float); + int num_chunks = head_size / chunk_size; + auto transposed = Transpose(kv_cache.data(), batch_size, number_of_heads, num_chunks, max_sequence_length, chunk_size); + CheckEquality(transposed.data(), reordered_kv_cache.data(), batch_size, number_of_heads, num_chunks, + max_sequence_length, past_sequence_length, chunk_size); + + tester.AddInput("past", past_dims, reordered_kv_cache); + + // Rel + tester.AddOptionalInputEdge(); + + // Past sequence length + std::vector arr_past_sequence_len(1, past_sequence_length); + tester.AddInput("past_sequence_length", {1}, arr_past_sequence_len); + + // QKV MatMul + auto qkv = QKV(input, weight, bias, batch_size, sequence_length, hidden_size); + auto* qkv_matrix = qkv.data(); + + auto pair = MergePastKWithPresentKAndTranspose(kv_cache.data(), qkv_matrix + hidden_size, batch_size, + number_of_heads, past_sequence_length, + max_sequence_length, head_size); + + auto k_merged = pair.first; + auto k_transpose = pair.second; + + auto qk_transpose = QK_Transpose(qkv_matrix, k_transpose.data(), batch_size, number_of_heads, + total_sequence_length, head_size); + + auto softmax_qk_transpose = Softmax_QK_Transpose(qk_transpose.data(), batch_size, number_of_heads, + sequence_length, total_sequence_length, head_size); + + auto present = MergeReorderedKVCacheWithK(reordered_kv_cache, qkv_matrix + hidden_size, batch_size, + number_of_heads, past_sequence_length, max_sequence_length, head_size); + + // Validate our test logic + // We want to validate if our merged "unordered" K is the same as + // the merged "ordered" K so that the QKT we do in our test code + // is equivalent to the QKT we do in the kernel + ValidateReorderedMergedKWithK(k_merged.data(), present.data(), batch_size, number_of_heads, total_sequence_length, max_sequence_length, head_size); + + MergeReorderedKVCacheWithV(present.data() + (past_present_size / 2), qkv_matrix + 2 * hidden_size, batch_size, + number_of_heads, past_sequence_length, max_sequence_length, head_size); + + auto output = Softmax_QK_Transpose_V(softmax_qk_transpose.data(), present.data() + (past_present_size / 2), + batch_size, number_of_heads, + sequence_length, total_sequence_length, + max_sequence_length, head_size); - MergeReorderedKVCacheWithV(present.data() + (past_present_size / 2), qkv_matrix + 2 * hidden_size, batch_size, - number_of_heads, past_sequence_length, max_sequence_length, head_size); - - auto output = Softmax_QK_Transpose_V(softmax_qk_transpose.data(), present.data() + (past_present_size / 2), - batch_size, number_of_heads, - sequence_length, total_sequence_length, - max_sequence_length, head_size); - - // Output(s) - tester.AddOutput("output", input_dims, output); + // Output(s) + tester.AddOutput("output", input_dims, output); - tester.AddOutput("present", past_dims, present); + tester.AddOutput("present", past_dims, present); - // Run - Regular kernel execution path - { - std::vector> execution_providers; - execution_providers.push_back(DefaultCudaExecutionProvider()); - tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); - } + // Run - Regular kernel execution path + { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + 
tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } - // Test alternate kernel path of loading more KV data "in flight" - { - ScopedEnvironmentVariables scoped_env_vars{ - EnvVarMap{{onnxruntime::contrib::attention::kDecoderMaskedAttentionLoadKVDataInFlight, "1"}}}; + // Test alternate kernel path of loading more KV data "in flight" + { + ScopedEnvironmentVariables scoped_env_vars{ + EnvVarMap{{onnxruntime::contrib::attention::kDecoderMaskedAttentionLoadKVDataInFlight, "1"}}}; - std::vector> execution_providers; - execution_providers.push_back(DefaultCudaExecutionProvider()); + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); - tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); - } - } + tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } } } @@ -766,122 +783,138 @@ TEST(DecoderMaskedSelfAttentionTest, Test_fp16) { return; } - // Vary batch size - for (int batch_size = 1; batch_size <= 5; batch_size += 2) { - // Vary kv_lengths - for (int past_sequence_length = 1; past_sequence_length <= 3000; past_sequence_length += 150) { - int sequence_length = 1; - int number_of_heads = 12; - - // Vary head_size / hidden_size - int hidden_sizes[3] = {384, 768, 1536}; - for (int hidden_size : hidden_sizes) { - int head_size = (hidden_size / number_of_heads); - int total_sequence_length = sequence_length + past_sequence_length; - int max_sequence_length = past_sequence_length + 1; // Always keep > past_sequence_length - - OpTester tester("DecoderMaskedSelfAttention", 1, onnxruntime::kMSDomain); - tester.AddAttribute("num_heads", static_cast(number_of_heads)); - tester.AddAttribute("past_present_share_buffer", static_cast(1)); - - std::vector input_dims = {batch_size, sequence_length, hidden_size}; - std::vector weights_dims = {hidden_size, 3 * hidden_size}; - std::vector bias_dims = {3 * hidden_size}; - std::vector output_dims = {batch_size, sequence_length, hidden_size}; - - auto input = CreateRandom(batch_size * sequence_length * hidden_size); - tester.AddInput("input", input_dims, input); - - auto weight = CreateRandom(hidden_size * 3 * hidden_size); - tester.AddInput("weight", weights_dims, weight); - - auto bias = CreateRandom(3 * hidden_size); - tester.AddInput("bias", bias_dims, bias); - - // Mask - tester.AddOptionalInputEdge(); - - // Past - std::vector past_dims = {2, batch_size, number_of_heads, max_sequence_length, head_size}; - int past_present_size = 2 * batch_size * number_of_heads * max_sequence_length * head_size; - - auto kv_cache = CreateRandom(past_present_size); - - auto reordered_kv_cache = ReorderKVCache(kv_cache, batch_size, - number_of_heads, past_sequence_length, head_size, max_sequence_length); + // Buckets for test data: + // batch_size: 1, >=2 + // past_sequence_length 0, 1~30, 31~2046, >=2047 (so that total_sequence_length: 1, 2-31, 32~2047, >=2048) + // head_size: 32, 64, 128 + struct MyTestCase { + int batch_size; + int past_sequence_length; + int hidden_size; + } test_cases[] = { + {1, 0, 768}, + {1, 1, 768}, + {3, 30, 384}, + {8, 31, 1536}, + {4, 256, 384}, + {3, 1024, 768}, + {2, 2046, 1536}, + {1, 2047, 384}, + {2, 3000, 768}, + }; + + constexpr int sequence_length = 1; + constexpr int number_of_heads = 12; + + for (MyTestCase test_case : test_cases) { + int batch_size = test_case.batch_size; + int past_sequence_length = test_case.past_sequence_length; + int hidden_size = test_case.hidden_size; + + 
int head_size = (hidden_size / number_of_heads); + int total_sequence_length = sequence_length + past_sequence_length; + int max_sequence_length = past_sequence_length + 1; // Always keep > past_sequence_length + + OpTester tester("DecoderMaskedSelfAttention", 1, onnxruntime::kMSDomain); + tester.AddAttribute("num_heads", static_cast(number_of_heads)); + tester.AddAttribute("past_present_share_buffer", static_cast(1)); + + std::vector input_dims = {batch_size, sequence_length, hidden_size}; + std::vector weights_dims = {hidden_size, 3 * hidden_size}; + std::vector bias_dims = {3 * hidden_size}; + std::vector output_dims = {batch_size, sequence_length, hidden_size}; + + auto input = CreateRandom(batch_size * sequence_length * hidden_size); + tester.AddInput("input", input_dims, input); + + auto weight = CreateRandom(hidden_size * 3 * hidden_size); + tester.AddInput("weight", weights_dims, weight); + + auto bias = CreateRandom(3 * hidden_size); + tester.AddInput("bias", bias_dims, bias); + + // Mask + tester.AddOptionalInputEdge(); + + // Past + std::vector past_dims = {2, batch_size, number_of_heads, max_sequence_length, head_size}; + int past_present_size = 2 * batch_size * number_of_heads * max_sequence_length * head_size; + + auto kv_cache = CreateRandom(past_present_size); + + auto reordered_kv_cache = ReorderKVCache(kv_cache, batch_size, + number_of_heads, past_sequence_length, head_size, max_sequence_length); - // Validate if reordering went well - by transposing and checking equality - int chunk_size = 16 / sizeof(MLFloat16); - int num_chunks = head_size / chunk_size; - auto transposed = Transpose(kv_cache.data(), batch_size, number_of_heads, num_chunks, max_sequence_length, chunk_size); - CheckEquality(transposed.data(), reordered_kv_cache.data(), batch_size, number_of_heads, num_chunks, - max_sequence_length, past_sequence_length, chunk_size); + // Validate if reordering went well - by transposing and checking equality + int chunk_size = 16 / sizeof(MLFloat16); + int num_chunks = head_size / chunk_size; + auto transposed = Transpose(kv_cache.data(), batch_size, number_of_heads, num_chunks, max_sequence_length, chunk_size); + CheckEquality(transposed.data(), reordered_kv_cache.data(), batch_size, number_of_heads, num_chunks, + max_sequence_length, past_sequence_length, chunk_size); - tester.AddInput("past", past_dims, reordered_kv_cache); + tester.AddInput("past", past_dims, reordered_kv_cache); - // Rel - tester.AddOptionalInputEdge(); + // Rel + tester.AddOptionalInputEdge(); - // Past sequence length - std::vector arr_past_sequence_len(1, past_sequence_length); - tester.AddInput("past_sequence_length", {1}, arr_past_sequence_len); + // Past sequence length + std::vector arr_past_sequence_len(1, past_sequence_length); + tester.AddInput("past_sequence_length", {1}, arr_past_sequence_len); - // QKV MatMul - auto qkv = QKV(input, weight, bias, batch_size, sequence_length, hidden_size); - auto* qkv_matrix = qkv.data(); + // QKV MatMul + auto qkv = QKV(input, weight, bias, batch_size, sequence_length, hidden_size); + auto* qkv_matrix = qkv.data(); - auto pair = MergePastKWithPresentKAndTranspose(kv_cache.data(), qkv_matrix + hidden_size, batch_size, - number_of_heads, past_sequence_length, - max_sequence_length, head_size); + auto pair = MergePastKWithPresentKAndTranspose(kv_cache.data(), qkv_matrix + hidden_size, batch_size, + number_of_heads, past_sequence_length, + max_sequence_length, head_size); - auto k_merged = pair.first; - auto k_transpose = pair.second; + auto k_merged = 
pair.first; + auto k_transpose = pair.second; - auto qk_transpose = QK_Transpose(qkv_matrix, k_transpose.data(), batch_size, number_of_heads, - total_sequence_length, head_size); + auto qk_transpose = QK_Transpose(qkv_matrix, k_transpose.data(), batch_size, number_of_heads, + total_sequence_length, head_size); - auto softmax_qk_transpose = Softmax_QK_Transpose(qk_transpose.data(), batch_size, number_of_heads, - sequence_length, total_sequence_length, head_size); + auto softmax_qk_transpose = Softmax_QK_Transpose(qk_transpose.data(), batch_size, number_of_heads, + sequence_length, total_sequence_length, head_size); - auto present = MergeReorderedKVCacheWithK(reordered_kv_cache, qkv_matrix + hidden_size, batch_size, - number_of_heads, past_sequence_length, max_sequence_length, head_size); + auto present = MergeReorderedKVCacheWithK(reordered_kv_cache, qkv_matrix + hidden_size, batch_size, + number_of_heads, past_sequence_length, max_sequence_length, head_size); - // Validate our test logic - // We want to validate if our merged "unordered" K is the same as - // the merged "ordered" K so that the QKT we do in our test code - // is equivalent to the QKT we do in the kernel - ValidateReorderedMergedKWithK(k_merged.data(), present.data(), batch_size, number_of_heads, total_sequence_length, max_sequence_length, head_size); + // Validate our test logic + // We want to validate if our merged "unordered" K is the same as + // the merged "ordered" K so that the QKT we do in our test code + // is equivalent to the QKT we do in the kernel + ValidateReorderedMergedKWithK(k_merged.data(), present.data(), batch_size, number_of_heads, total_sequence_length, max_sequence_length, head_size); - MergeReorderedKVCacheWithV(present.data() + (past_present_size / 2), qkv_matrix + 2 * hidden_size, batch_size, - number_of_heads, past_sequence_length, max_sequence_length, head_size); + MergeReorderedKVCacheWithV(present.data() + (past_present_size / 2), qkv_matrix + 2 * hidden_size, batch_size, + number_of_heads, past_sequence_length, max_sequence_length, head_size); - auto output = Softmax_QK_Transpose_V(softmax_qk_transpose.data(), present.data() + (past_present_size / 2), - batch_size, number_of_heads, - sequence_length, total_sequence_length, - max_sequence_length, head_size); + auto output = Softmax_QK_Transpose_V(softmax_qk_transpose.data(), present.data() + (past_present_size / 2), + batch_size, number_of_heads, + sequence_length, total_sequence_length, + max_sequence_length, head_size); - // Output(s) - tester.AddOutput("output", input_dims, output); + // Output(s) + tester.AddOutput("output", input_dims, output); - tester.AddOutput("present", past_dims, present); + tester.AddOutput("present", past_dims, present); - // Run - Regular kernel execution path - { - std::vector> execution_providers; - execution_providers.push_back(DefaultCudaExecutionProvider()); - tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); - } + // Run - Regular kernel execution path + { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } - // Test alternate kernel path of loading more KV data "in flight" - { - ScopedEnvironmentVariables scoped_env_vars{ - EnvVarMap{{onnxruntime::contrib::attention::kDecoderMaskedAttentionLoadKVDataInFlight, "1"}}}; + // Test alternate kernel path of loading more KV data "in flight" + { + ScopedEnvironmentVariables 
scoped_env_vars{ + EnvVarMap{{onnxruntime::contrib::attention::kDecoderMaskedAttentionLoadKVDataInFlight, "1"}}}; - std::vector> execution_providers; - execution_providers.push_back(DefaultCudaExecutionProvider()); - tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); - } - } + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } } } @@ -889,4 +922,4 @@ TEST(DecoderMaskedSelfAttentionTest, Test_fp16) { #endif } // namespace test -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime From ef0b71308c0e2395d3ea63e627515ff8e624ad45 Mon Sep 17 00:00:00 2001 From: Sheil Kumar Date: Fri, 16 Feb 2024 05:34:55 -0800 Subject: [PATCH 004/279] Optimize KahnsTopologicalSort and PriorityNodeCompare (#19475) **Description** 1) During SessionInitialization, KahnsTopologicalSort is a major cause of perf degradation. The main cause of slow down is that the TopologicalSort needs to keep track of nodes to visit in order, and reorder them based on priority (as informed by a comparator). The existing implementation uses a priority_queue that is backed by a std::vector container. However, vectors are not good for insertion and reordering. The appropriate data type for this operation is a linked list. However, linked lists like std::list are not usable as a container for std::priority_queue. This is because std::priority_queue requires random access, which linked lists do not have. However, for this simple implementation, we can leverage a std::list under the hood and perform insertions manually using std::upper_bound. This drastically reduces the time taken by the method, which currently instead causes numerous recopies and a lot of movement inside the graph nodes to visit list. 2) In the comparator, I hide forward and backward attribute checking behind the #ifdef ENABLE_TRAINING macro, as I believe it should only be valid in the training scenario. 3) In noopelimination transformer, I prevent the creation of Initializer (which unpacks tensorproto data) in every node and only create initializers when Add/Sub/Mul/Div op nodes are detected. **Motivation and Context** Session creation time of many models is quite slow. 
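To make the pattern in (1) easier to see in isolation, here is a minimal, self-contained Python sketch of Kahn's algorithm with a ready list kept sorted by a priority comparator (an illustration only; the actual change is the C++ `VisitorPriorityQueue` in the diff below):

```python
import bisect

def kahns_topological_sort(num_nodes, edges, priority):
    """Kahn's algorithm with the ready list kept ordered by (priority, node),
    mirroring the ordered-insertion idea described above."""
    out_edges = [[] for _ in range(num_nodes)]
    in_degree = [0] * num_nodes
    for src, dst in edges:
        out_edges[src].append(dst)
        in_degree[dst] += 1

    # Nodes with no incoming edges are ready; keep the list sorted so the
    # highest-priority (lowest value) node is always popped first.
    ready = sorted((priority(n), n) for n in range(num_nodes) if in_degree[n] == 0)
    order = []
    while ready:
        _, node = ready.pop(0)
        order.append(node)
        for nxt in out_edges[node]:
            in_degree[nxt] -= 1
            if in_degree[nxt] == 0:
                bisect.insort(ready, (priority(nxt), nxt))  # ordered insert, no heap

    if len(order) != num_nodes:
        raise ValueError("graph has a cycle")
    return order

# Example: 0 -> 1 and 0 -> 2, with node 2 given higher priority (lower value) than node 1.
print(kahns_topological_sort(3, [(0, 1), (0, 2)], priority=lambda n: {0: 0, 1: 2, 2: 1}[n]))
# [0, 2, 1]
```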
--------- Co-authored-by: Sheil Kumar --- onnxruntime/core/graph/graph.cc | 37 ++++++++-- onnxruntime/core/graph/graph_viewer.cc | 18 +++-- .../core/optimizer/noop_elimination.cc | 73 +++++++++++-------- .../ort_optimizer_api_impl.cc | 2 +- 4 files changed, 85 insertions(+), 45 deletions(-) diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 902839bee04ba..305122c56b865 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -1818,16 +1818,36 @@ void Graph::ReverseDFSFrom(gsl::span from, } } +template +struct VisitorPriorityQueue { + using ComparatorType = std::function; + std::list list_; + const ComparatorType comparator_ = nullptr; + VisitorPriorityQueue(const ComparatorType& comp) : comparator_(comp) {} + + void push(T node) { + list_.insert( + std::upper_bound(list_.begin(), list_.end(), node, comparator_), + node); + } + bool empty() { return list_.empty(); } + T top() { return list_.back(); } + void pop() { list_.pop_back(); } +}; + #if !defined(ORT_MINIMAL_BUILD) void Graph::KahnsTopologicalSort(const std::function& enter, const std::function& comp) const { - std::unordered_map in_degree; - std::priority_queue, decltype(comp)> to_visit(comp); - std::vector topo_order; + InlinedVector in_degree(MaxNodeIndex(), 0); + InlinedVector topo_order; + VisitorPriorityQueue to_visit(comp); + + auto number_of_nodes = NumberOfNodes(); + topo_order.reserve(number_of_nodes); for (auto& node : Nodes()) { size_t input_edge_count = node.GetInputEdgesCount(); - in_degree.insert({node.Index(), input_edge_count}); + in_degree[node.Index()] = input_edge_count; if (input_edge_count == 0) { to_visit.push(&node); } @@ -1844,16 +1864,17 @@ void Graph::KahnsTopologicalSort(const std::function& enter, } for (auto node_it = current->OutputNodesBegin(); node_it != current->OutputNodesEnd(); ++node_it) { - in_degree[node_it->Index()]--; + auto& node_in_degree = in_degree[node_it->Index()]; + node_in_degree--; - if (in_degree[node_it->Index()] == 0) { + if (node_in_degree == 0) { to_visit.push(&*node_it); } } topo_order.push_back(current->Index()); } - if (NumberOfNodes() != static_cast(topo_order.size())) { + if (number_of_nodes != static_cast(topo_order.size())) { ORT_THROW("Some nodes are not included in the topological sort, graph have a cycle."); } } @@ -2843,7 +2864,7 @@ void Graph::AddInitializedTensor(const TensorProto& tensor) { const gsl::not_null tensor_added{graph_proto_->add_initializer()}; *(tensor_added) = tensor; - name_to_initial_tensor_[tensor.name()] = tensor_added; + name_to_initial_tensor_.emplace(tensor.name(), tensor_added); SetGraphResolveNeeded(); if (!is_loaded_from_model_file_ && GetNodeArg(tensor.name()) == nullptr) { // make sure there is a NodeArg for the initializer as SetGraphInputsOutputs may add it to the graph inputs. 
diff --git a/onnxruntime/core/graph/graph_viewer.cc b/onnxruntime/core/graph/graph_viewer.cc index acf7b3a16541f..119d420066a84 100644 --- a/onnxruntime/core/graph/graph_viewer.cc +++ b/onnxruntime/core/graph/graph_viewer.cc @@ -14,8 +14,8 @@ bool NodeCompare::operator()(const Node* n1, const Node* n2) const { struct PriorityNodeCompare { inline bool IsHighPri(const Node* n) const { // local statics so we can compare std::strings in the checks - static const std::string shape_op("Shape"); - static const std::string size_op("Size"); + static constexpr std::string_view shape_op("Shape"); + static constexpr std::string_view size_op("Size"); const auto& op_type = n->OpType(); return op_type == shape_op || op_type == size_op; @@ -26,15 +26,20 @@ struct PriorityNodeCompare { // If return true, n2 will be output first bool operator()(const Node* n1, const Node* n2) const { // nodes in global high priority list will be output first - if (IsHighPri(n1) != IsHighPri(n2)) { - return IsHighPri(n2); + const bool isN1HighPri = IsHighPri(n1); + const bool isN2HighPri = IsHighPri(n2); + if (isN1HighPri != isN2HighPri) { + return isN2HighPri; } // nodes with lower priority value will be output first - if (n1->Priority() != n2->Priority()) { - return n1->Priority() > n2->Priority(); + const auto n1_priority = n1->Priority(); + const auto n2_priority = n2->Priority(); + if (n1_priority != n2_priority) { + return n1_priority > n2_priority; } +#ifdef ENABLE_TRAINING // nodes of forward pass will be output first auto n1_attrs = n1->GetAttributes(); auto n2_attrs = n2->GetAttributes(); @@ -45,6 +50,7 @@ struct PriorityNodeCompare { if (n1_is_forward != n2_is_forward) { return n2_is_forward > n1_is_forward; } +#endif // otherwise, nodes with lower index will be output first return n1->Index() > n2->Index(); diff --git a/onnxruntime/core/optimizer/noop_elimination.cc b/onnxruntime/core/optimizer/noop_elimination.cc index b3c2991d54b28..bba39b698a27a 100644 --- a/onnxruntime/core/optimizer/noop_elimination.cc +++ b/onnxruntime/core/optimizer/noop_elimination.cc @@ -42,49 +42,62 @@ bool NoopElimination::SatisfyCondition(const Graph& graph, const Node& node, con // if initializer_rank is bigger, the output is expected to be initializer_rank per broadcasting rule, // but it won't happen if the case is accepted, thus reject it - auto initializer_rank = initializer->dims().size(); + const auto& dims = initializer->dims(); + auto initializer_rank = dims.size(); const auto* other_input_shape = node.InputDefs()[input0_is_initializer ? 
1 : 0]->Shape(); if (other_input_shape == nullptr || initializer_rank > other_input_shape->dim_size()) { return false; } - int32_t data_type = initializer->data_type(); - Initializer add_init(*initializer, graph.ModelPath()); - if (add_init.size() > 1) { + int64_t tensor_size = 1; + for (auto i : dims) { + tensor_size *= i; + } + + if (tensor_size > 1) { return false; } + // handle edge case where the total size of the initializer is 0 - if (add_init.size() == 0) { + if (tensor_size == 0) { return true; } - float value = 0.0f; - switch (data_type) { - case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: - value = *add_init.data(); - break; - case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: - value = math::halfToFloat(add_init.data()->val); - break; - case ONNX_NAMESPACE::TensorProto_DataType_DOUBLE: - value = static_cast(*add_init.data()); - break; - case ONNX_NAMESPACE::TensorProto_DataType_INT32: - value = static_cast(*add_init.data()); - break; - case ONNX_NAMESPACE::TensorProto_DataType_INT64: - value = static_cast(*add_init.data()); - break; - default: + if (op_type == "Add" || + op_type == "Sub" || + op_type == "Mul" || + op_type == "Div") { + int32_t data_type = initializer->data_type(); + Initializer add_init(*initializer, graph.ModelPath()); + + float value = 0.0f; + switch (data_type) { + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: + value = *add_init.data(); + break; + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: + value = math::halfToFloat(add_init.data()->val); + break; + case ONNX_NAMESPACE::TensorProto_DataType_DOUBLE: + value = static_cast(*add_init.data()); + break; + case ONNX_NAMESPACE::TensorProto_DataType_INT32: + value = static_cast(*add_init.data()); + break; + case ONNX_NAMESPACE::TensorProto_DataType_INT64: + value = static_cast(*add_init.data()); + break; + default: + return false; + } + + if (value != 0.0f && (op_type == "Add" || op_type == "Sub")) { return false; - } + } - if ((op_type == "Add" || op_type == "Sub") && value != 0.0f) { - return false; - } - - if ((op_type == "Mul" || op_type == "Div") && value != 1.0f) { - return false; + if (value != 1.0f && (op_type == "Mul" || op_type == "Div")) { + return false; + } } // reject node output is graph output for now diff --git a/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc b/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc index d9f08ffe1171e..c532f56b3d3d9 100644 --- a/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc +++ b/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc @@ -115,7 +115,7 @@ class ApiGraph final : public api::GraphRef { const auto& graph_outputs = graph_.GetOutputs(); graph_outputs_.reserve(graph_outputs.size()); for (const auto* output : graph_outputs) { - graph_outputs_.insert(output->Name()); + graph_outputs_.emplace(output->Name()); } } From b84712151c06f0f59359916be572f71bd36721a4 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Fri, 16 Feb 2024 14:36:05 -0800 Subject: [PATCH 005/279] QNN EP: Fuse DQ -> Q sequences into a QNN Convert op (#19511) ### Description Fuses DQ -> Q sequences into a QNN Convert operator if: - Converting from one qtype to another. Ex: Dequantize(uint8 to float) -> Quantize(float to uint16) - The DQ and Q operators are not part of another node unit (i.e., standalone) - The Q operator is the only consumer for the DQ operator. 
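To make the targeted pattern concrete, here is a small sketch of a standalone uint8 -> uint16 requantization that this fusion would map to a single QNN Convert. The names, shapes, and use of standard opset-21 Q/DQ are illustrative (the unit tests added in this PR use the com.microsoft contrib Q/DQ ops instead):

```python
import numpy as np
import onnx
from onnx import TensorProto, helper, numpy_helper

# Standalone DQ -> Q pair: DequantizeLinear(uint8 -> float) feeding
# QuantizeLinear(float -> uint16), same scale dtype, different zero-point dtype.
scale = numpy_helper.from_array(np.array(0.05, dtype=np.float32), "scale")
zp_u8 = numpy_helper.from_array(np.array(0, dtype=np.uint8), "zp_u8")
zp_u16 = numpy_helper.from_array(np.array(0, dtype=np.uint16), "zp_u16")

dq = helper.make_node("DequantizeLinear", ["x_u8", "scale", "zp_u8"], ["x_f32"])
q = helper.make_node("QuantizeLinear", ["x_f32", "scale", "zp_u16"], ["x_u16"])

graph = helper.make_graph(
    [dq, q], "u8_to_u16_requantize",
    [helper.make_tensor_value_info("x_u8", TensorProto.UINT8, [1, 8])],
    [helper.make_tensor_value_info("x_u16", TensorProto.UINT16, [1, 8])],
    initializer=[scale, zp_u8, zp_u16],
)
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 21)])
onnx.checker.check_model(model)
```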
### Motivation and Context Allows faster execution of QDQ models with mixed activation types by leveraging the QNN Convert operator, which converts between quantization types. For certain models, this results in inference latency speed-ups of up to 2x (depends on the number of DQ -> Q sequences). #### Example for Add node unit with 16-bit I/O: Original: ``` u8 ----> DQ ---> Q ---u16--> Add ---u16--> ^ | u16 --------------------------+ ``` After fusing DQ -> Q: ``` u8 ----> Convert ---u16--> Add ---u16--> ^ | u16 ------------------------+ ``` --- .../optimizer/qdq_transformer/qdq_util.cc | 43 ++++++++ .../core/optimizer/qdq_transformer/qdq_util.h | 12 ++ .../qnn/builder/op_builder_factory.h | 23 ++++ .../builder/opbuilder/convert_op_builder.cc | 103 ++++++++++++++++++ .../core/providers/qnn/builder/qnn_model.cc | 35 +++++- .../providers/qnn/qnn_execution_provider.cc | 88 +++++++++------ .../providers/qnn/qnn_execution_provider.h | 1 - .../test/providers/qnn/simple_op_htp_test.cc | 55 ++++++++++ 8 files changed, 319 insertions(+), 41 deletions(-) create mode 100644 onnxruntime/core/providers/qnn/builder/opbuilder/convert_op_builder.cc diff --git a/onnxruntime/core/optimizer/qdq_transformer/qdq_util.cc b/onnxruntime/core/optimizer/qdq_transformer/qdq_util.cc index b1ab641a23256..4e3dff705bd41 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/qdq_util.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/qdq_util.cc @@ -76,6 +76,49 @@ bool IsQDQPairSupported( } } +bool IsDQQConversion( + const Node& dq_node, const Node& q_node, + const GetConstantInitializerFn& get_const_initializer, + const Path& model_path) { + ConstPointerContainer> dq_input_defs = dq_node.InputDefs(); + ConstPointerContainer> q_input_defs = q_node.InputDefs(); + + // Q/DQ contains optional input is not supported + // non-scalar Q/DQ scale and zero point needs are not supported + if (dq_input_defs.size() != InputIndex::TOTAL_COUNT || + q_input_defs.size() != InputIndex::TOTAL_COUNT || + !optimizer_utils::IsScalar(*q_input_defs[InputIndex::SCALE_ID]) || + !optimizer_utils::IsScalar(*q_input_defs[InputIndex::ZERO_POINT_ID]) || + !optimizer_utils::IsScalar(*dq_input_defs[InputIndex::SCALE_ID]) || + !optimizer_utils::IsScalar(*dq_input_defs[InputIndex::ZERO_POINT_ID])) { + return false; + } + + // if Q/DQ scale and zero point are not constant, return false + const ONNX_NAMESPACE::TensorProto* dq_scale_tensor_proto = + get_const_initializer(dq_input_defs[InputIndex::SCALE_ID]->Name()); + const ONNX_NAMESPACE::TensorProto* q_scale_tensor_proto = + get_const_initializer(q_input_defs[InputIndex::SCALE_ID]->Name()); + const ONNX_NAMESPACE::TensorProto* dq_zp_tensor_proto = + get_const_initializer(dq_input_defs[InputIndex::ZERO_POINT_ID]->Name()); + const ONNX_NAMESPACE::TensorProto* q_zp_tensor_proto = + get_const_initializer(q_input_defs[InputIndex::ZERO_POINT_ID]->Name()); + if (nullptr == q_zp_tensor_proto || + nullptr == dq_zp_tensor_proto || + nullptr == q_scale_tensor_proto || + nullptr == dq_scale_tensor_proto) { + return false; + } + + // check Q/DQ have same scale type and different zero point type + Initializer q_zp(*q_zp_tensor_proto, model_path); + Initializer q_scale(*q_scale_tensor_proto, model_path); + Initializer dq_zp(*dq_zp_tensor_proto, model_path); + Initializer dq_scale(*dq_scale_tensor_proto, model_path); + + return (dq_zp.data_type() != q_zp.data_type()) && (dq_scale.data_type() == q_scale.data_type()); +} + bool IsDQSupported(const Node& dq_node, const GetConstantInitializerFn& get_const_initializer) { 
bool zero_point_exists = false; if (!QOrDQNodeHasConstantScalarScaleAndZeroPoint(dq_node, get_const_initializer, zero_point_exists)) { diff --git a/onnxruntime/core/optimizer/qdq_transformer/qdq_util.h b/onnxruntime/core/optimizer/qdq_transformer/qdq_util.h index bb0bf9438cfcb..8333168b0093f 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/qdq_util.h +++ b/onnxruntime/core/optimizer/qdq_transformer/qdq_util.h @@ -38,6 +38,18 @@ bool IsQDQPairSupported( const GetConstantInitializerFn& get_const_initializer, const Path& model_path); +// Check if a DQ -> Q sequence represents a conversion in quantization data type. +// Example of uint8 to uint16: +// Dequantize (uint8 to float) -> Quantize (float to uint16) +// Requires: +// 1. Q/DQ doesn't have optional input. +// 2. scale and zero-point are constant scalars. +// 3. Q and DQ have the same scale *type* and different zero-point *types*. +bool IsDQQConversion( + const Node& dq_node, const Node& q_node, + const GetConstantInitializerFn& get_const_initializer, + const Path& model_path); + // Check if DQ is supported in extended level QDQ transformers. It requires: // 1. DQ doesn't have optional input. // 2. scale and zero point is constant scalar diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.h b/onnxruntime/core/providers/qnn/builder/op_builder_factory.h index d95e2baa9457f..4a9106f0c06af 100644 --- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.h +++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.h @@ -94,5 +94,28 @@ void CreatePadOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_r void CreateExpandOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); +struct HandleConvertResult { + Status status; // Indicates an unexpected error. Check if q_node_unit != nullptr to determine + // whether a DQ -> Q sequence was successfully merged into a Convert. + const NodeUnit* q_node_unit; // Non-null if successfully merged DQ -> Q sequence. + // Set to nullptr if this node unit could not be merged into a Convert. +}; + +/** + * Tries to merge a DQ -> Q sequence into a QNN Convert operator. The DQ -> Q must be converting from + * one quantization type (e.g., uint8_t) to another (e.g., uint16_t). + * + * \param qnn_model_wrapper The QNN model that is being built. + * \param maybe_dq_node_unit The node unit that could potentially start the DQ -> Q sequence. + * \param logger The logger. + * \param do_op_validation True if should call QNN operator validation APIs. + * \return An qnn::HandleConvertResult object that indicates success/failure and provides a pointer + * to the Q node unit that was successfully merged with the provided DQ node unit. + */ +HandleConvertResult TryHandleConvertSequence(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& maybe_dq_node_unit, + const std::unordered_map& node_unit_map, + const logging::Logger& logger, + bool do_op_validation); } // namespace qnn } // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/convert_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/convert_op_builder.cc new file mode 100644 index 0000000000000..977a9e0b3d9d0 --- /dev/null +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/convert_op_builder.cc @@ -0,0 +1,103 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/graph/graph_utils.h" +#include "core/optimizer/qdq_transformer/qdq_util.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" +#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/qnn_model_wrapper.h" +#include "core/providers/qnn/builder/op_builder_factory.h" +#include "core/common/safeint.h" +#include "onnx/defs/data_type_utils.h" + +#include "QnnOpDef.h" // From QNN SDK: contains QNN constants (e.g., op names, param values). + +namespace onnxruntime { +namespace qnn { + +class ConvertOpBuilder : public BaseOpBuilder { + public: + ConvertOpBuilder() : BaseOpBuilder("ConvertOpBuilder") {} + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(ConvertOpBuilder); + + Status AddConvertToModelBuilder(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& dq_node_unit, + const NodeUnit& q_node_unit, + const logging::Logger& logger, + bool do_op_validation) const ORT_MUST_USE_RESULT; +}; + +Status ConvertOpBuilder::AddConvertToModelBuilder(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& dq_node_unit, + const NodeUnit& q_node_unit, + const logging::Logger& logger, + bool do_op_validation) const { + std::vector input_names; + + // Process the input from the DQ node + ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, dq_node_unit.Inputs()[0], logger, input_names)); + + // Process the output from the Q node. Override the QNN operator type to "Convert". + ORT_RETURN_IF_ERROR(ProcessOutputs(qnn_model_wrapper, q_node_unit, std::move(input_names), {}, + logger, do_op_validation, QNN_OP_CONVERT)); + return Status::OK(); +} + +HandleConvertResult TryHandleConvertSequence(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& maybe_dq_node_unit, + const std::unordered_map& node_unit_map, + const logging::Logger& logger, + bool do_op_validation) { + const GraphViewer& graph_viewer = qnn_model_wrapper.GetGraphViewer(); + + // Looking for a standalone DQ to start the sequence. + if (maybe_dq_node_unit.OpType() != QDQ::DQOpName || maybe_dq_node_unit.UnitType() != NodeUnit::Type::SingleNode) { + return {}; + } + + const Node& dq_node = maybe_dq_node_unit.GetNode(); + + // DQ must have a single Q child. DQ must not produce a graph output. + auto children = graph_utils::FindChildrenByType(dq_node, QDQ::QOpName); + if (children.size() != 1 || dq_node.GetOutputEdgesCount() != 1 || graph_viewer.NodeProducesGraphOutput(dq_node)) { + return {}; + } + + const Node& q_node = *children[0]; + const auto q_node_unit_it = node_unit_map.find(&q_node); + + if (q_node_unit_it == node_unit_map.end()) { + return {ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Node does not have a corresponding NodeUnit"), nullptr}; + } + + const NodeUnit* q_node_unit = q_node_unit_it->second; + + // Q child must not already be part of a QDQ NodeUnit (i.e., be standalone). + if (q_node_unit->UnitType() != NodeUnit::Type::SingleNode) { + return {}; + } + + auto get_const_initializer = [&graph_viewer](const std::string& initializer_name) { + return graph_viewer.GetConstantInitializer(initializer_name, true); + }; + + // DQ and Q must have equal scale type and different zp type. + if (!QDQ::IsDQQConversion(dq_node, q_node, get_const_initializer, graph_viewer.ModelPath())) { + return {}; + } + + ConvertOpBuilder op_builder; + + LOGS(logger, VERBOSE) << " Adding QNN Convert. 
dq_node name: [" << dq_node.Name() + << "] dq_node optype: [" << dq_node.OpType() + << "] q_node name: [" << q_node_unit->Name() + << "] q_node optype: [" << q_node_unit->OpType() + << "]"; + + auto status = op_builder.AddConvertToModelBuilder(qnn_model_wrapper, maybe_dq_node_unit, *q_node_unit, logger, + do_op_validation); + return status.IsOK() ? HandleConvertResult{status, q_node_unit} : HandleConvertResult{status, nullptr}; +} + +} // namespace qnn +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.cc b/onnxruntime/core/providers/qnn/builder/qnn_model.cc index 314cab4a36ca9..dc91b9dfa199e 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model.cc @@ -114,6 +114,8 @@ Status QnnModel::ComposeGraph(const GraphViewer& graph_viewer, return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to initialize qnn_model_wrapper."); } + std::unordered_set handled_node_units; + // Op builer const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder(); for (size_t i = 0; i < node_indices.size(); i++) { @@ -122,20 +124,43 @@ Status QnnModel::ComposeGraph(const GraphViewer& graph_viewer, // Check whether it's part of NodeUnit const NodeUnit& node_unit = GetNodeUnit(node, node_unit_map); // Q, DQ nodes in the node unit only carry the quantization parameters - // Add the QNN node when it is the target node (It's a normal node or a singel Q/DQ node) + // Add the QNN node when it is the target node (It's a normal node or a single Q/DQ node) const std::string& op_type = node_unit.OpType(); + + if (node != &node_unit.GetNode()) { + continue; + } + + if (handled_node_units.count(&node_unit) != 0) { + continue; // Already handled. + } + + // Try to convert particular DQ -> Q sequences into QNN Convert op + auto convert_result = TryHandleConvertSequence(qnn_model_wrapper, + node_unit, + node_unit_map, + logger_, + false /*do_op_validation*/); + ORT_RETURN_IF_ERROR(convert_result.status); + + if (convert_result.q_node_unit) { + // Successfully merged DQ -> Q sequence into a QNN Convert op. + // Mark both of these node units as handled. 
+ handled_node_units.insert(&node_unit); + handled_node_units.insert(convert_result.q_node_unit); + continue; + } + LOGS(logger_, VERBOSE) << " node name: [" << node->Name() << "] node optype: [" << op_type << "] as part of the NodeUnit type: [" << node_unit.OpType() << "] name: [" << node_unit.Name() << "]"; - if (node != &node_unit.GetNode()) { - continue; - } - if (const auto* op_builder = GetOpBuilder(op_type)) { ORT_RETURN_IF_ERROR(op_builder->AddToModelBuilder(qnn_model_wrapper, node_unit, logger_)); } + + handled_node_units.insert(&node_unit); } ORT_RETURN_IF_NOT(qnn_model_wrapper.ComposeQnnGraph(), "Failed to compose Qnn graph."); diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index b58f6e10df94c..f5a166d36b15a 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -286,33 +286,24 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio } bool QNNExecutionProvider::IsNodeSupported(qnn::QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, - std::unordered_map& node_unit_supported_result, const logging::Logger& logger) const { - // If we have visited one of the nodes in the node_unit, use the result directly - const auto it = node_unit_supported_result.find(&node_unit); - if (it != node_unit_supported_result.cend()) { - return it->second; + const std::string& op_type = node_unit.OpType(); + bool supported = false; + const auto* op_builder = qnn::GetOpBuilder(op_type); + if (op_builder == nullptr) { + LOGS(logger, WARNING) << "Operators of type `" << node_unit.OpType() << "` are not supported by QNN EP." + << node_unit.OpType() << " node `" << node_unit.Name() + << "` will not be assigned to QNN EP."; } else { - const std::string& op_type = node_unit.OpType(); - - bool supported = false; - const auto* op_builder = qnn::GetOpBuilder(op_type); - if (op_builder == nullptr) { - LOGS(logger, WARNING) << "Operators of type `" << node_unit.OpType() << "` are not supported by QNN EP." 
- << node_unit.OpType() << " node `" << node_unit.Name() - << "` will not be assigned to QNN EP."; - } else { - auto status = op_builder->IsOpSupported(qnn_model_wrapper, - node_unit, logger); - if (Status::OK() != status) { - LOGS(logger, WARNING) << node_unit.OpType() << " node `" << node_unit.Name() - << "` is not supported: " << status.ErrorMessage(); - } - supported = (Status::OK() == status); + auto status = op_builder->IsOpSupported(qnn_model_wrapper, + node_unit, logger); + if (Status::OK() != status) { + LOGS(logger, WARNING) << node_unit.OpType() << " node `" << node_unit.Name() + << "` is not supported: " << status.ErrorMessage(); } - node_unit_supported_result[&node_unit] = supported; - return supported; + supported = (Status::OK() == status); } + return supported; } std::unordered_set @@ -391,24 +382,51 @@ QNNExecutionProvider::GetSupportedNodes(const GraphViewer& graph_viewer, if (node != &node_unit->GetNode()) { continue; } - const bool supported = IsNodeSupported(qnn_model_wrapper, - *node_unit, - node_unit_supported_result, - logger); - LOGS(logger, VERBOSE) << "Node supported: [" << supported - << "] index: [" << node->Index() - << "] name: [" << node->Name() - << "] Operator type: [" << node->OpType() - << "] as part of the NodeUnit type: [" << node_unit->OpType() - << "] index: [" << node_unit->Index() - << "] name: [" << node_unit->Name() - << "]"; + + if (node_unit_supported_result.count(node_unit) != 0) { + continue; // Already handled this node unit + } + + // Try to convert certain standalone DQ -> Q sequences into QNN Convert op + auto convert_result = TryHandleConvertSequence(qnn_model_wrapper, + *node_unit, + node_unit_map, + logger, + true /*do_op_validation*/); + if (!convert_result.status.IsOK()) { + LOGS(logger, WARNING) << "Failed to convert DQ -> Q sequence to QNN Convert. " + << "Type: " << node_unit->OpType() << ", Node name: " << node_unit->Name() << ", " + << "Message: " << convert_result.status.ErrorMessage(); + } + + bool supported = false; + + if (convert_result.status.IsOK() && convert_result.q_node_unit) { // Merged DQ -> Q sequence into QNN Convert op + supported = true; + + // Mark the Q node unit as handled and supported here so that we don't try to process it again. + node_unit_supported_result.insert({convert_result.q_node_unit, true}); + supported_nodes.insert(&convert_result.q_node_unit->GetNode()); + } else { + supported = IsNodeSupported(qnn_model_wrapper, *node_unit, logger); + LOGS(logger, VERBOSE) << "Node supported: [" << supported + << "] index: [" << node->Index() + << "] name: [" << node->Name() + << "] Operator type: [" << node->OpType() + << "] as part of the NodeUnit type: [" << node_unit->OpType() + << "] index: [" << node_unit->Index() + << "] name: [" << node_unit->Name() + << "]"; + } + if (supported) { // If the node_unit is supported, add all of its nodes to the supported list. 
for (const auto* node_in_group : node_unit->GetAllNodesInGroup()) { supported_nodes.insert(node_in_group); } } + + node_unit_supported_result.insert({node_unit, supported}); } return supported_nodes; diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index 09bcb24db4dc2..0bcaa39b22f6d 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -42,7 +42,6 @@ class QNNExecutionProvider : public IExecutionProvider { private: bool IsNodeSupported(qnn::QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, - std::unordered_map& node_unit_supported_result, const logging::Logger& logger) const; std::unordered_set GetSupportedNodes(const GraphViewer& graph_viewer, diff --git a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc index 2f3b0e84a123e..a6422407d79fd 100644 --- a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc +++ b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc @@ -1110,6 +1110,61 @@ TEST_F(QnnHTPBackendTests, LpNormalization_u16_rank4) { kOnnxDomain, true); } + +static GetTestQDQModelFn BuildQDQConvertAddTestCase(const TestInputDef& input0_def, + const TestInputDef& input1_def) { + return [input0_def, input1_def](ModelTestBuilder& builder, std::vector>& output_qparams) { + constexpr bool use_contrib_qdq = true; + + // Input0 -> Quantize(u8) -> Dequantize(u8 to float) -> input0_after_qdq + NodeArg* input0 = MakeTestInput(builder, input0_def); + QuantParams input0_u8_qparams = GetTestInputQuantParams(input0_def); + NodeArg* input0_after_qdq = AddQDQNodePair(builder, input0, input0_u8_qparams.scale, + input0_u8_qparams.zero_point, use_contrib_qdq); + + // input0_after_qdq -> Quantize(u16) -> Dequantize(u16 to float) + QuantParams input0_u16_qparams = GetTestInputQuantParams(input0_def); + NodeArg* input0_after_convert = AddQDQNodePair(builder, input0_after_qdq, input0_u16_qparams.scale, + input0_u16_qparams.zero_point, use_contrib_qdq); + + // Input1 -> Quantize(u16) -> Dequantize(u16 to float) -> input1_after_qdq + NodeArg* input1 = MakeTestInput(builder, input1_def); + QuantParams input1_qparams = GetTestInputQuantParams(input1_def); + NodeArg* input1_after_qdq = AddQDQNodePair(builder, input1, input1_qparams.scale, + input1_qparams.zero_point, use_contrib_qdq); + + // Add op -> op_output + auto* op_output = builder.MakeIntermediate(); + builder.AddNode("Add", {input0_after_convert, input1_after_qdq}, {op_output}); + + // op_output -> Q -> DQ -> output + AddQDQNodePairWithOutputAsGraphOutput(builder, op_output, output_qparams[0].scale, + output_qparams[0].zero_point, use_contrib_qdq); + }; +} + +// Test quantization type conversion (mixed precision) with Add. +// First input is converted from uint8_t to uint16_t. 
+TEST_F(QnnHTPBackendTests, Add_U8_U16_Convert) { + std::vector input0_data = GetFloatDataInRange(-10.0f, 10.0f, 8); + std::vector input1_data = GetFloatDataInRange(-20.0f, 20.0f, 8); + TestInputDef input0_def({1, 2, 2, 2}, false, input0_data); + TestInputDef input1_def({1, 2, 2, 2}, false, input1_data); + + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + + TestQDQModelAccuracy(BuildOpTestCase("Add", {input0_def, input1_def}, {}, {}, kOnnxDomain), + BuildQDQConvertAddTestCase(input0_def, input1_def), + provider_options, + 18, + ExpectedEPNodeAssignment::All); +} + #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) } // namespace test From 1dce5e17321d50bf345022b525a937933473415a Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Fri, 16 Feb 2024 14:41:11 -0800 Subject: [PATCH 006/279] Disable TF32 in Linux_Test stage of Linux GPU CI Pipeline (#19541) ### Description Some test thresholds that previously worked in T4 GPU does not work anymore. The reason is current pipeline uses A10, and TF32 is enabled by default. Disable TF32 in Linux GPU CI Pipeline in testing to avoid such random test failure. ### Motivation and Context Linux Test has random failure at tests: ProviderOptionsTest > testCUDAOptions() FAILED org.opentest4j.AssertionFailedError: array contents differ at index [446], expected: <0.0419757> but was: <0.041948937> at app//org.junit.jupiter.api.AssertionFailureBuilder.build(AssertionFailureBuilder.java:151) at app//org.junit.jupiter.api.AssertionFailureBuilder.buildAndThrow(AssertionFailureBuilder.java:132) at app//org.junit.jupiter.api.AssertArrayEquals.failArraysNotEqual(AssertArrayEquals.java:440) at app//org.junit.jupiter.api.AssertArrayEquals.assertArrayEquals(AssertArrayEquals.java:290) at app//org.junit.jupiter.api.AssertArrayEquals.assertArrayEquals(AssertArrayEquals.java:123) at app//org.junit.jupiter.api.AssertArrayEquals.assertArrayEquals(AssertArrayEquals.java:119) at app//org.junit.jupiter.api.Assertions.assertArrayEquals(Assertions.java:1360) at app//ai.onnxruntime.providers.ProviderOptionsTest.runProvider(ProviderOptionsTest.java:99) at app//ai.onnxruntime.providers.ProviderOptionsTest.testCUDAOptions(ProviderOptionsTest.java:43) org.opentest4j.AssertionFailedError: array contents differ at index [6], expected: <0.0225981> but was: <0.022587791> at app//org.junit.jupiter.api.AssertionFailureBuilder.build(AssertionFailureBuilder.java:151) at app//org.junit.jupiter.api.AssertionFailureBuilder.buildAndThrow(AssertionFailureBuilder.java:132) at app//org.junit.jupiter.api.AssertArrayEquals.failArraysNotEqual(AssertArrayEquals.java:440) at app//org.junit.jupiter.api.AssertArrayEquals.assertArrayEquals(AssertArrayEquals.java:290) at app//org.junit.jupiter.api.AssertArrayEquals.assertArrayEquals(AssertArrayEquals.java:123) at app//org.junit.jupiter.api.AssertArrayEquals.assertArrayEquals(AssertArrayEquals.java:119) at app//org.junit.jupiter.api.Assertions.assertArrayEquals(Assertions.java:1360) at app//ai.onnxruntime.InferenceTest.runProvider(InferenceTest.java:676) at app//ai.onnxruntime.InferenceTest.testCUDA(InferenceTest.java:615) --- tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index b19a8b11db265..24319184dd0b8 100644 --- 
a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -204,6 +204,7 @@ jobs: --volume /data/models:/build/models:ro \ --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ --volume /data/onnx:/data/onnx \ + -e NVIDIA_TF32_OVERRIDE=0 \ $(Repository) \ /bin/bash -c " set -ex; \ From 44d8ad93b20efdba921ca80f23485c084b5174d0 Mon Sep 17 00:00:00 2001 From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Date: Fri, 16 Feb 2024 15:21:43 -0800 Subject: [PATCH 007/279] Whisper Timestamps and Temperature (#19509) ### Description This PR updates exporting and running the Whisper model with beam search by adding the following. - Adds temperature as a graph input to the exported model - Fixes the token ids by adding them as attributes to `WhisperBeamSearch` - Fixes the timestamps test cases so they pass now - Fixes a bug with invoking `torch.onnx.export` - Cleans up the Whisper scripts and groups the arguments in `convert_to_onnx.py` - Adds a `requirements.txt` file to specify package dependencies - Adds `whisper-large-v3` to list of pretrained models - Fixes a bug with missing cross-attention KV cache inputs in the decoder subgraph ### Motivation and Context - This is a follow-up to [this PR](https://github.com/microsoft/onnxruntime/pull/19188). - The incorrect token ids in the timestamps processor were first noticed during [this PR review](https://github.com/microsoft/onnxruntime/pull/17500#discussion_r1333520007). When they were originally added in [this PR](https://github.com/microsoft/onnxruntime/pull/15853), the offsets were previously constant across the Whisper model sizes. When comparing the new `whisper-large-v3` variant, the English-only variants (e.g. `whisper-tiny.en`), and the original variants (e.g. `whisper-tiny`), both the values and the offsets differ. Therefore, it is easier to set the token ids as attributes to `WhisperBeamSearch` when exporting to ensure the right values are used in the timestamps processor. - The Hugging Face API for returning timestamps and the expected outputs from the PyTorch model have both changed. - The fix for `torch.onnx.export` is a follow-up to [this PR review](https://github.com/microsoft/onnxruntime/pull/17179#issuecomment-1683001470). - The argument grouping is a follow-up to [this PR review](https://github.com/microsoft/onnxruntime/pull/17500#discussion_r1333521721). - Specific package versions are needed to run the Whisper scripts and the `requirements.txt` file ensures that these versions are installed. - The `whisper-large-v3` variant is released and should be in the list of official pretrained models. - After the changes from [this PR](https://github.com/microsoft/onnxruntime/pull/17316), the exported model is not loading in an ORT inference session because the cross-attention KV cache inputs are missing in the decoder subgraph. 
--- docs/ContribOperators.md | 32 +- .../transformers/beam_search_impl_whisper.h | 4 +- .../transformers/beam_search_parameters.cc | 8 +- .../cpu/transformers/generation_shared.h | 9 +- .../cpu/transformers/logits_processor.h | 81 +++-- .../transformers/generation_device_helper.cc | 12 +- .../core/graph/contrib_ops/contrib_defs.cc | 40 +-- .../transformers/models/whisper/README.md | 46 ++- .../transformers/models/whisper/benchmark.py | 22 +- .../models/whisper/benchmark_all.py | 6 + .../models/whisper/convert_to_onnx.py | 277 ++++++++++-------- .../models/whisper/requirements-cpu.txt | 2 + .../models/whisper/requirements-cuda.txt | 4 + .../models/whisper/requirements.txt | 11 + .../models/whisper/whisper_chain.py | 272 +++++++++-------- .../models/whisper/whisper_decoder.py | 2 +- .../whisper/whisper_encoder_decoder_init.py | 6 +- .../models/whisper/whisper_helper.py | 79 ++--- .../transformers/torch_onnx_export_helper.py | 3 +- .../python/transformers/test_generation.py | 19 +- .../test_whisper_timestamp_processor.py | 4 +- 21 files changed, 560 insertions(+), 379 deletions(-) create mode 100644 onnxruntime/python/tools/transformers/models/whisper/requirements-cpu.txt create mode 100644 onnxruntime/python/tools/transformers/models/whisper/requirements-cuda.txt create mode 100644 onnxruntime/python/tools/transformers/models/whisper/requirements.txt diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index e7b537d6894c8..f523e97293427 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -461,7 +461,7 @@ This version of the operator has been available since version 1 of the 'com.micr
repetition_penalty (optional) : T
The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)
vocab_mask (optional) : M
-Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)
+Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)
prefix_vocab_mask (optional) : M
Mask of vocabulary for first step. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)
attention_mask (optional) : I
@@ -2252,7 +2252,7 @@ This version of the operator has been available since version 1 of the 'com.micr
repetition_penalty (optional) : T
The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)
vocab_mask (optional) : I
-Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)
+Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)
prefix_vocab_mask (optional) : I
Mask of vocabulary for first step. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)
attention_mask (optional) : I
@@ -5154,7 +5154,7 @@ This version of the operator has been available since version 1 of the 'com.micr
repetition_penalty (optional) : T
The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)
vocab_mask (optional) : I
-Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)
+Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)
prefix_vocab_mask (optional) : I
Mask of vocabulary for first step. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)
attention_mask (optional) : I
@@ -5743,12 +5743,14 @@ This version of the operator has been available since version 1 of the 'com.micr #### Attributes
+beginning_timestamp_token_id : int
+The id of the first timestamp
decoder : graph (required)
Decoder subgraph to execute in a loop.
decoder_output_cross_qk : int
If nozero, decoder subgraph contains output Q*K from cross attentions. Default 0.
decoder_start_token_id : int
-The id of the token that indicates decoding starts.
+The id of the token that indicates decoding starts (i.e. the start of transcription token id)
early_stopping : int
early stop or not
encoder : graph
@@ -5761,10 +5763,18 @@ This version of the operator has been available since version 1 of the 'com.micr
Must be 2 for whisper
no_repeat_ngram_size : int
no repeat ngrams size
-no_speech_token : int
+no_speech_token_id : int
The token in whisper model that marks all sequence empty. With this model, whisper could output no_speech_prob after. Default -1.
+no_timestamps_token_id : int
+The id of the token that indicates no timestamps
pad_token_id : int (required)
The id of the padding token
+start_of_lm_token_id : int
+The id of the token that indicates LM starts
+transcribe_token_id : int
+The id of the transcribe task
+translate_token_id : int
+The id of the translate task
vocab_size : int
Size of the vocabulary. If not provided, it will be inferred from the decoder subgraph's output shape
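[Editorial illustration, not part of this patch.] The token-id attributes above replace the fixed offsets that the timestamps processor previously hardcoded, because the values differ between the multilingual, English-only, and large-v3 checkpoints. A minimal sketch of how the per-checkpoint ids could be resolved from the Hugging Face tokenizer is shown below; the checkpoint name and the special-token spellings are assumptions (the no-speech token in particular has been spelled differently across tokenizer versions), so this is illustrative only.

```python
# Hedged sketch: resolve the token ids that the WhisperBeamSearch attributes
# expect for a given checkpoint. Checkpoint name and token spellings are
# illustrative assumptions, not values taken from this patch.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai/whisper-tiny")
vocab = tokenizer.get_vocab()  # includes added special tokens

def token_id(*candidates):
    # Return the id of the first candidate spelling present in the vocab.
    for tok in candidates:
        if tok in vocab:
            return vocab[tok]
    return -1  # fall back to the attributes' default

attrs = {
    "eos_token_id": token_id("<|endoftext|>"),
    "decoder_start_token_id": token_id("<|startoftranscript|>"),
    "translate_token_id": token_id("<|translate|>"),
    "transcribe_token_id": token_id("<|transcribe|>"),
    "start_of_lm_token_id": token_id("<|startoflm|>"),
    "no_speech_token_id": token_id("<|nospeech|>", "<|nocaptions|>"),
    "no_timestamps_token_id": token_id("<|notimestamps|>"),
    # In the OpenAI vocab layout, <|0.00|> directly follows <|notimestamps|>;
    # older tokenizer versions may not expose timestamp tokens in the vocab.
    "beginning_timestamp_token_id": token_id("<|0.00|>"),
}
print(attrs)
```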
@@ -5783,11 +5793,11 @@ This version of the operator has been available since version 1 of the 'com.micr
num_return_sequences : I
The number of returned sequences in the batch. Shape is (1)
length_penalty (optional) : T
-Exponential penalty to the length. Default value 1.0 means no penalty.Value > 1.0 encourages longer sequences, while values < 1.0 produces shorter sequences.Shape is (1,)
+Exponential penalty to the length. Default value 1.0 means no penalty. Value > 1.0 encourages longer sequences, while values < 1.0 produces shorter sequences. Shape is (1,)
repetition_penalty (optional) : T
The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)
vocab_mask (optional) : M
-Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)
+Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)
prefix_vocab_mask (optional) : M
Mask of vocabulary for first step. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)
attention_mask (optional) : I
@@ -5797,7 +5807,7 @@ This version of the operator has been available since version 1 of the 'com.micr
logits_processor (optional) : I
Specific logits processor for different types of beamsearch models. Default value 0 means no specific logit processor. Accepts value >= 0. Shape is (1)
cross_qk_layer_head (optional) : I
-
Only keep this list of (layer, head) of QK in the final cross_qk output when use_cross_qk is set. Default collect allits shape is (number of (layer, head) to keep, 2), i.e., [[layer_id1, head_id1], [layer_id2, head_id2]......]
+
Only keep this list of (layer, head) of QK in the final cross_qk output when use_cross_qk is set. Default collect all its shape is (number of (layer, head) to keep, 2), i.e., [[layer_id1, head_id1], [layer_id2, head_id2]......]
extra_decoding_ids (optional) : I
Part of the decoder_input_ids that we need cross qk for it. it is of shape (batch_size, extra_decoding_ids_len).In such case, we should remove this from the tail of the decoder_input_ids, and put it here. ids < 0 in it (for multiple batch) are treated as stop of the extra_decoding_ids for corresponding batch.
temperature (optional) : T
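[Editorial illustration, not part of this patch.] The optional inputs documented above are plain tensors. A short sketch of how they might be materialized with NumPy follows; the dtypes follow the op's "I"/"T" type constraints and the benchmark script in this patch, while the concrete layer/head pairs are invented for illustration.

```python
# Hedged sketch of the optional WhisperBeamSearch inputs described above.
# Values are illustrative only.
import numpy as np

# Shape (number of (layer, head) pairs to keep, 2):
# e.g. keep cross QK of layer 0 / head 1 and layer 3 / head 2.
cross_qk_layer_head = np.array([[0, 1], [3, 2]], dtype=np.int32)

# Shape (1) controls; 1 selects the Whisper timestamps logits processor,
# and temperature 1.0 leaves the logits unscaled.
logits_processor = np.array([1], dtype=np.int32)
temperature = np.array([1.0], dtype=np.float32)
```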
@@ -5812,11 +5822,11 @@ This version of the operator has been available since version 1 of the 'com.micr
sequences_scores (optional) : T
Final beam score of the generated sequences. Shape is (batch_size, num_return_sequences)
scores (optional) : T
-Processed beam scores for each vocabulary token at each generation step.Beam scores consisting of log softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam.Shape is (max_length - sequence_length, batch_size, num_beams, vocab_size)
+Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam. Shape is (max_length - sequence_length, batch_size, num_beams, vocab_size)
cross_qk (optional) : V
-Output the accumulated stacked Q*K in cross attentions. Let H = number of Head of cross attention, F = the frames or kv-seq-len of the cross attention input, T = real decoded token length, L = number of layers,B = batch size, R = num_return_sequences. It then should return tensor of shape [B, R, L*H, T, F].If cross_qk_layer_head is given, shape is [B, R, cross_qk_layer_head.shape[0], T, F]
+Output the accumulated stacked Q*K in cross attentions. Let H = number of Head of cross attention, F = the frames or kv-seq-len of the cross attention input, T = real decoded token length, L = number of layers, B = batch size, R = num_return_sequences. It then should return tensor of shape [B, R, L*H, T, F]. If cross_qk_layer_head is given, shape is [B, R, cross_qk_layer_head.shape[0], T, F]
non_speech_probs (optional) : T
-For whisper model, output the probabilities from logits after encoder and context decoding for the no_speech_token.Currently we treat the last token's logits is what we need, in future extra graph logic may be add to the encoder/context-decoder subgraph.The prob is save before logits may be updated by extra-decoding-ids. The shape of non_speech_probs is [B]
+For whisper model, output the probabilities from logits after encoder and context decoding for the no_speech_token_id. The shape of non_speech_probs is [B]
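[Editorial illustration, not part of this patch.] To make the new inputs and outputs concrete, here is a hedged sketch of driving a model exported by convert_to_onnx with onnxruntime. The model path, the "input_features" input name, and the decoding parameters are assumptions about the export rather than values from this patch; the optional names ("logits_processor", "temperature") match the benchmark script and are only fed if the session actually exposes them.

```python
# Hedged sketch: run an exported Whisper beam-search model and feed the new
# optional inputs when present. Paths and several input names are assumptions.
import numpy as np
import onnxruntime as ort
from datasets import load_dataset
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
sample = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")[0]["audio"]
features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="np").input_features

session = ort.InferenceSession("whispertiny/whisper-tiny_beamsearch.onnx")  # hypothetical output path
available = {i.name for i in session.get_inputs()}

candidate_feeds = {
    "input_features": features.astype(np.float32),
    "max_length": np.array([128], dtype=np.int32),
    "min_length": np.array([0], dtype=np.int32),
    "num_beams": np.array([2], dtype=np.int32),
    "num_return_sequences": np.array([1], dtype=np.int32),
    "length_penalty": np.array([1.0], dtype=np.float32),
    "repetition_penalty": np.array([1.0], dtype=np.float32),
    "logits_processor": np.array([1], dtype=np.int32),  # enable the timestamps processor
    "temperature": np.array([1.0], dtype=np.float32),   # new optional input
}
feeds = {name: value for name, value in candidate_feeds.items() if name in available}

sequences = session.run(["sequences"], feeds)[0]  # (batch, num_return_sequences, max_length)
print(processor.batch_decode(sequences[0], skip_special_tokens=False))
```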
#### Type Constraints diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h index 72e6d3930a548..af0904b7d6e4b 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h @@ -134,8 +134,8 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe TensorShape no_speech_probs_shape{parameters->batch_size}; Tensor* no_speech_probs = this->context_.Output(parameters->no_speech_probs_output_id, no_speech_probs_shape); if (no_speech_probs && no_speech_probs->MutableData()) { - ORT_ENFORCE(parameters->no_speech_token >= 0 && parameters->no_speech_token < parameters->vocab_size, - "no_speech_token id out of range, it is ", parameters->no_speech_token, + ORT_ENFORCE(parameters->no_speech_token_id >= 0 && parameters->no_speech_token_id < parameters->vocab_size, + "no_speech_token_id is out of range, it is ", parameters->no_speech_token_id, ", vocab_size is ", parameters->vocab_size); this->parameters_->no_speech_probs = (void*)no_speech_probs->MutableData(); } diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc index bb6885c3216bc..93837e785b4a4 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc @@ -153,7 +153,13 @@ void WhisperBeamSearchParameters::ParseFromAttributes(const OpKernelInfo& info) model_type = static_cast(info.GetAttrOrDefault("model_type", IGenerationParameters::kModelTypeWhisper)); ORT_ENFORCE(model_type == IGenerationParameters::kModelTypeWhisper); - no_speech_token = static_cast(info.GetAttrOrDefault("no_speech_token", -1LL)); + // Token ids are defined below in the order that they appear in the tokenizer + translate_token_id = static_cast(info.GetAttrOrDefault("translate_token_id", -1LL)); + transcribe_token_id = static_cast(info.GetAttrOrDefault("transcribe_token_id", -1LL)); + start_of_lm_token_id = static_cast(info.GetAttrOrDefault("start_of_lm_token_id", -1LL)); + no_speech_token_id = static_cast(info.GetAttrOrDefault("no_speech_token_id", -1LL)); + no_timestamps_token_id = static_cast(info.GetAttrOrDefault("no_timestamps_token_id", -1LL)); + beginning_timestamp_token_id = static_cast(info.GetAttrOrDefault("beginning_timestamp_token_id", -1LL)); cross_qk_layer_head_input_id = 12; extra_decoding_ids_input_id = 13; cross_qk_output_id = 3; diff --git a/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h b/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h index cb62e2f7bf4da..b1dd55eb20f34 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h +++ b/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h @@ -183,7 +183,14 @@ struct IGenerationParameters { // Parameters for whisper model bool decoder_output_cross_qk = false; gsl::span extra_decoding_ids; - int32_t no_speech_token = -1; + + // Token ids are defined below in the order that they appear in the tokenizer + int32_t translate_token_id = -1; + int32_t transcribe_token_id = -1; + int32_t start_of_lm_token_id = -1; + int32_t no_speech_token_id = -1; + int32_t no_timestamps_token_id = -1; + int32_t beginning_timestamp_token_id = -1; void* no_speech_probs = nullptr; int cross_qk_layer_head_input_id = -1; diff --git a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h 
b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h index 03d4e89ac20fe..231eb17d1a947 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h +++ b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h @@ -10,6 +10,7 @@ #include "contrib_ops/cpu/transformers/greedy_search_parameters.h" #include "contrib_ops/cpu/transformers/sampling_parameters.h" #include "contrib_ops/cpu/transformers/generation_shared.h" +#include namespace onnxruntime { namespace contrib { @@ -34,6 +35,14 @@ struct NextTokenScores { } }; +#ifdef DEBUG_GENERATION +template +void DumpScores(const char* name, const NextTokenScores& next_token_scores) { + std::cout << name << std::endl; + ORT_UNUSED_PARAMETER(next_token_scores); +} +#endif + // Interface for all scorers for beam search or beam sample. template class ILogitsProcessor { @@ -150,19 +159,25 @@ class PresencePenaltyLogitsProcessor : public ILogitsProcessor { template class TimestampLogitsProcessor : public ILogitsProcessor { public: - TimestampLogitsProcessor(int eos_token_id, int max_initial_timestamp_index) - : eos_token_id_(eos_token_id), max_initial_timestamp_index_(max_initial_timestamp_index) {} + TimestampLogitsProcessor(int end_of_text_token_id, // <|endoftext|> + int start_of_transcript_token_id, // <|startoftranscript|> + int translate_token_id, // <|translate|> + int transcribe_token_id, // <|transcribe|> + int start_of_lm_token_id, // <|startoflm|> + int no_timestamps_token_id, // <|notimestamps|> + int beginning_timestamp_token_id, // <|0.00|> + int max_initial_timestamp_index) + : end_of_text_token_id_(end_of_text_token_id), + start_of_transcript_token_id_(start_of_transcript_token_id), + translate_token_id_(translate_token_id), + transcribe_token_id_(transcribe_token_id), + start_of_lm_token_id_(start_of_lm_token_id), + no_timestamps_token_id_(no_timestamps_token_id), + beginning_timestamp_token_id_(beginning_timestamp_token_id), + max_initial_timestamp_index_(max_initial_timestamp_index) {} void Process(const ISequences* sequences, NextTokenScores& next_token_scores) override { - // TODO: translate_token_id_ and transcribe_token_id_ need to support both multilingual and English-only models. 
- const int beg_token_id_ = eos_token_id_ + 107; - const int not_token_id_ = eos_token_id_ + 106; - const int solm_token_id_ = eos_token_id_ + 105; - const int sot_token_id_ = eos_token_id_ + 1; - constexpr int translate_token_id_ = 50358; - constexpr int transcribe_token_id_ = 50359; - const int batch_beam_size = next_token_scores.batch_beam_size; const int vocab_size = next_token_scores.vocab_size; for (int i = 0; i < batch_beam_size; i++) { @@ -174,7 +189,7 @@ class TimestampLogitsProcessor : public ILogitsProcessor { size_t sample_begin = 0; for (size_t j = 0; j < seq_length; j++) { sample_begin++; - if (sequence[j] >= beg_token_id_) { + if (sequence[j] >= beginning_timestamp_token_id_) { break; } } @@ -182,30 +197,30 @@ class TimestampLogitsProcessor : public ILogitsProcessor { // Suppress tokens for (int j = 0; j < vocab_size; j++) { // Suppress notimestamps and solm tokens - if (j == not_token_id_ || j == solm_token_id_) { + if (j == no_timestamps_token_id_ || j == start_of_lm_token_id_) { beam_token_scores[j] = std::numeric_limits::lowest(); } // Suppress sot, translate and transcribe tokens if (seq_length > sample_begin) { - if (j == sot_token_id_ || j == translate_token_id_ || j == transcribe_token_id_) { + if (j == start_of_transcript_token_id_ || j == translate_token_id_ || j == transcribe_token_id_) { beam_token_scores[j] = std::numeric_limits::lowest(); } } } // Timestamps should be in pair except the first one - const bool last_was_timestamp = seq_length > 0 && sequence.back() >= beg_token_id_; - const bool penultimate_was_timestamp = seq_length <= sample_begin || sequence[seq_length - 2] >= beg_token_id_; + const bool last_was_timestamp = seq_length > 0 && sequence.back() >= beginning_timestamp_token_id_; + const bool penultimate_was_timestamp = seq_length <= sample_begin || sequence[seq_length - 2] >= beginning_timestamp_token_id_; if (last_was_timestamp) { if (penultimate_was_timestamp) { // If timestamps show up in pair, or it's the first timestamp, no more timestamp is generated - for (int j = beg_token_id_; j < vocab_size; j++) { + for (int j = beginning_timestamp_token_id_; j < vocab_size; j++) { beam_token_scores[j] = std::numeric_limits::lowest(); } } else { // If timestamp doesn't show up in pair, generate timestamp - for (int j = 0; j < eos_token_id_; j++) { + for (int j = 0; j < end_of_text_token_id_; j++) { beam_token_scores[j] = std::numeric_limits::lowest(); } } @@ -214,7 +229,7 @@ class TimestampLogitsProcessor : public ILogitsProcessor { // Find timestamp tokens std::vector timestamps; for (const auto& word_id : sequence) { - if (word_id >= beg_token_id_) { + if (word_id >= beginning_timestamp_token_id_) { timestamps.push_back(word_id); } } @@ -231,13 +246,13 @@ class TimestampLogitsProcessor : public ILogitsProcessor { timestamp_last = timestamps.back() + 1; } - for (int j = beg_token_id_; j < timestamp_last; j++) { + for (int j = beginning_timestamp_token_id_; j < timestamp_last; j++) { beam_token_scores[j] = std::numeric_limits::lowest(); } } if (seq_length == sample_begin) { - const int last_allowed = beg_token_id_ + max_initial_timestamp_index_; + const int last_allowed = beginning_timestamp_token_id_ + max_initial_timestamp_index_; for (int j = last_allowed + 1; j < vocab_size; j++) { beam_token_scores[j] = std::numeric_limits::lowest(); } @@ -247,8 +262,8 @@ class TimestampLogitsProcessor : public ILogitsProcessor { float timestamp_logprob = std::numeric_limits::lowest(); { float logsumexp = 0.0f; - const float logprob_max = 
*std::max_element(beam_token_scores.begin() + beg_token_id_, beam_token_scores.end()); - for (int j = beg_token_id_; j < vocab_size; ++j) { + const float logprob_max = *std::max_element(beam_token_scores.begin() + beginning_timestamp_token_id_, beam_token_scores.end()); + for (int j = beginning_timestamp_token_id_; j < vocab_size; ++j) { if (beam_token_scores[j] > std::numeric_limits::lowest()) { logsumexp += expf(beam_token_scores[j] - logprob_max); } @@ -258,9 +273,9 @@ class TimestampLogitsProcessor : public ILogitsProcessor { } } - const float max_text_token_logprob = *std::max_element(beam_token_scores.begin(), beam_token_scores.begin() + beg_token_id_); + const float max_text_token_logprob = *std::max_element(beam_token_scores.begin(), beam_token_scores.begin() + beginning_timestamp_token_id_); if (timestamp_logprob > max_text_token_logprob) { - for (int j = 0; j < beg_token_id_; ++j) { + for (int j = 0; j < beginning_timestamp_token_id_; ++j) { beam_token_scores[j] = std::numeric_limits::lowest(); } } @@ -268,7 +283,13 @@ class TimestampLogitsProcessor : public ILogitsProcessor { } private: - int eos_token_id_; + int end_of_text_token_id_; + int start_of_transcript_token_id_; + int translate_token_id_; + int transcribe_token_id_; + int start_of_lm_token_id_; + int no_timestamps_token_id_; + int beginning_timestamp_token_id_; int max_initial_timestamp_index_; }; @@ -330,7 +351,15 @@ class LogitsProcessorList : public ILogitsProcessorList { // Add timestamp processor for whisper model if (parameters.model_type == IGenerationParameters::kModelTypeWhisper && parameters.logits_processor == IGenerationParameters::kLogitsProcessorTypeWhisper) { constexpr int max_initial_timestamp_index = 50; - timestamp_processor_ = std::make_unique>(parameters.eos_token_id, max_initial_timestamp_index); + // Token ids are passed below in the order that they appear in the tokenizer + timestamp_processor_ = std::make_unique>(parameters.eos_token_id, + parameters.decoder_start_token_id, + parameters.translate_token_id, + parameters.transcribe_token_id, + parameters.start_of_lm_token_id, + parameters.no_timestamps_token_id, + parameters.beginning_timestamp_token_id, + max_initial_timestamp_index); processor_list_.push_back(timestamp_processor_.get()); } diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc index bba30805ae1be..7adc2fe0a67ea 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc @@ -424,7 +424,7 @@ Status ProcessLogits(const OrtValue& logits, // const bool is_whisper_model = (parameters->model_type == onnxruntime::contrib::transformers::IGenerationParameters::kModelTypeWhisper); if (step == 1 && is_whisper_model && parameters->no_speech_probs) { cuda::LaunchSaveNoSpeechProbs( - (T*)parameters->no_speech_probs, Y_data, batch_size, num_beams, vocab_size, parameters->no_speech_token, cuda_stream); + (T*)parameters->no_speech_probs, Y_data, batch_size, num_beams, vocab_size, parameters->no_speech_token_id, cuda_stream); } // NOTE: currently we treat extra decoding ids are same @@ -469,7 +469,15 @@ Status ProcessLogits(const OrtValue& logits, // cudaMemcpyDeviceToHost, cuda_stream)); constexpr int max_initial_timestamp_index = 50; - onnxruntime::contrib::transformers::TimestampLogitsProcessor time_logit_processor(parameters->eos_token_id, max_initial_timestamp_index); + // Token ids are passed 
below in the order that they appear in the tokenizer + onnxruntime::contrib::transformers::TimestampLogitsProcessor time_logit_processor(parameters->eos_token_id, + parameters->decoder_start_token_id, + parameters->translate_token_id, + parameters->transcribe_token_id, + parameters->start_of_lm_token_id, + parameters->no_timestamps_token_id, + parameters->beginning_timestamp_token_id, + max_initial_timestamp_index); onnxruntime::contrib::transformers::NextTokenScores next_token_scores_timestamp({cpu_next_token_scores_span, batch_beam_size, vocab_size}); CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(cuda_stream)); diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index 27c968a59eb91..e33ce20737f80 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -1163,7 +1163,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA(BeamSearch, 1, "Shape is (1,)", "T", OpSchema::Optional) .Input(6, "repetition_penalty", "The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)", "T", OpSchema::Optional) - .Input(7, "vocab_mask", "Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)", "M", OpSchema::Optional) + .Input(7, "vocab_mask", "Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)", "M", OpSchema::Optional) .Input(8, "prefix_vocab_mask", "Mask of vocabulary for first step. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)", "M", OpSchema::Optional) .Input(9, "attention_mask", "Custom attention mask. Shape is (batch_size, sequence_length)", "I", OpSchema::Optional) .Input(10, "decoder_input_ids", "The forced input id sequence for the decoder subgraph. Shape is (batch_size, initial_sequence_length)", "I", OpSchema::Optional) @@ -1188,7 +1188,15 @@ ONNX_MS_OPERATOR_SET_SCHEMA(WhisperBeamSearch, 1, .SetDoc("Beam Search for whisper model, especiall with cross_qk features etc.") .Attr("eos_token_id", "The id of the end-of-sequence token", AttributeProto::INT) .Attr("pad_token_id", "The id of the padding token", AttributeProto::INT) - .Attr("decoder_start_token_id", "The id of the token that indicates decoding starts.", AttributeProto::INT, static_cast(-1)) + .Attr("decoder_start_token_id", "The id of the token that indicates decoding starts (i.e. the start of transcription token id)", AttributeProto::INT, static_cast(-1)) + .Attr("translate_token_id", "The id of the translate task", AttributeProto::INT, OPTIONAL_VALUE) + .Attr("transcribe_token_id", "The id of the transcribe task", AttributeProto::INT, OPTIONAL_VALUE) + .Attr("start_of_lm_token_id", "The id of the token that indicates LM starts", AttributeProto::INT, OPTIONAL_VALUE) + .Attr("no_speech_token_id", + "The token in whisper model that marks all sequence empty. With this model, whisper could output no_speech_prob after. 
Default -1.", + AttributeProto::INT, OPTIONAL_VALUE) + .Attr("no_timestamps_token_id", "The id of the token that indicates no timestamps", AttributeProto::INT, OPTIONAL_VALUE) + .Attr("beginning_timestamp_token_id", "The id of the first timestamp", AttributeProto::INT, OPTIONAL_VALUE) .Attr("no_repeat_ngram_size", "no repeat ngrams size", AttributeProto::INT, static_cast(0)) .Attr("early_stopping", "early stop or not", AttributeProto::INT, static_cast(0)) .Attr("model_type", "Must be 2 for whisper", AttributeProto::INT, static_cast(2)) @@ -1203,27 +1211,24 @@ ONNX_MS_OPERATOR_SET_SCHEMA(WhisperBeamSearch, 1, "If not provided, it will be inferred from the decoder subgraph's output shape", AttributeProto::INT, static_cast(-1)) .Attr("decoder_output_cross_qk", "If nozero, decoder subgraph contains output Q*K from cross attentions. Default 0.", AttributeProto::INT, OPTIONAL_VALUE) - .Attr("no_speech_token", - "The token in whisper model that marks all sequence empty. With this model, whisper could output no_speech_prob after. Default -1.", - AttributeProto::INT, OPTIONAL_VALUE) .Input(0, "input_ids", "The sequence used as a prompt for the generation in the encoder subgraph. Shape is (batch_size, sequence_length)", "F") .Input(1, "max_length", "The maximum length of the sequence to be generated. Shape is (1)", "I") .Input(2, "min_length", "The minimum length below which the score of eos_token_id is set to -Inf. Shape is (1)", "I", OpSchema::Optional) .Input(3, "num_beams", "Number of beams for beam search. 1 means no beam search. Shape is (1)", "I") .Input(4, "num_return_sequences", "The number of returned sequences in the batch. Shape is (1)", "I") .Input(5, "length_penalty", - "Exponential penalty to the length. Default value 1.0 means no penalty." - "Value > 1.0 encourages longer sequences, while values < 1.0 produces shorter sequences." + "Exponential penalty to the length. Default value 1.0 means no penalty. " + "Value > 1.0 encourages longer sequences, while values < 1.0 produces shorter sequences. " "Shape is (1,)", "T", OpSchema::Optional) .Input(6, "repetition_penalty", "The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)", "T", OpSchema::Optional) - .Input(7, "vocab_mask", "Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)", "M", OpSchema::Optional) + .Input(7, "vocab_mask", "Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)", "M", OpSchema::Optional) .Input(8, "prefix_vocab_mask", "Mask of vocabulary for first step. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)", "M", OpSchema::Optional) .Input(9, "attention_mask", "Custom attention mask. Shape is (batch_size, sequence_length)", "I", OpSchema::Optional) .Input(10, "decoder_input_ids", "The forced input id sequence for the decoder subgraph. Shape is (batch_size, initial_sequence_length)", "I", OpSchema::Optional) .Input(11, "logits_processor", "Specific logits processor for different types of beamsearch models. Default value 0 means no specific logit processor. Accepts value >= 0. Shape is (1)", "I", OpSchema::Optional) .Input(12, "cross_qk_layer_head", - "Only keep this list of (layer, head) of QK in the final cross_qk output when use_cross_qk is set. Default collect all" + "Only keep this list of (layer, head) of QK in the final cross_qk output when use_cross_qk is set. 
Default collect all " "its shape is (number of (layer, head) to keep, 2), i.e., [[layer_id1, head_id1], [layer_id2, head_id2]......]", "I", OpSchema::Optional) .Input(13, "extra_decoding_ids", @@ -1235,20 +1240,19 @@ ONNX_MS_OPERATOR_SET_SCHEMA(WhisperBeamSearch, 1, .Output(0, "sequences", "Word IDs of generated sequences. Shape is (batch_size, num_return_sequences, max_sequence_length)", "I") .Output(1, "sequences_scores", "Final beam score of the generated sequences. Shape is (batch_size, num_return_sequences)", "T", OpSchema::Optional) .Output(2, "scores", - "Processed beam scores for each vocabulary token at each generation step." - "Beam scores consisting of log softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam." + "Processed beam scores for each vocabulary token at each generation step. " + "Beam scores consisting of log softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam. " "Shape is (max_length - sequence_length, batch_size, num_beams, vocab_size)", "T", OpSchema::Optional) .Output(3, "cross_qk", "Output the accumulated stacked Q*K in cross attentions. Let H = number of Head of cross attention, " - "F = the frames or kv-seq-len of the cross attention input, T = real decoded token length, L = number of layers," - "B = batch size, R = num_return_sequences. It then should return tensor of shape [B, R, L*H, T, F]." + "F = the frames or kv-seq-len of the cross attention input, T = real decoded token length, L = number of layers, " + "B = batch size, R = num_return_sequences. It then should return tensor of shape [B, R, L*H, T, F]. " "If cross_qk_layer_head is given, shape is [B, R, cross_qk_layer_head.shape[0], T, F]", "V", OpSchema::Optional) .Output(4, "non_speech_probs", - "For whisper model, output the probabilities from logits after encoder and context decoding for the no_speech_token." - "Currently we treat the last token's logits is what we need, in future extra graph logic may be add to the encoder/context-decoder subgraph." - "The prob is save before logits may be updated by extra-decoding-ids. The shape of non_speech_probs is [B]", + "For whisper model, output the probabilities from logits after encoder and context decoding for the no_speech_token_id. " + "The shape of non_speech_probs is [B]", "T", OpSchema::Optional) .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain to float tensors.") .TypeConstraint("F", {"tensor(float)", "tensor(int32)", "tensor(float16)"}, "Constrain input type to float or int tensors.") @@ -1322,7 +1326,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA(GreedySearch, 1, .Input(1, "max_length", "The maximum length of the sequence to be generated. Shape is (1)", "I") .Input(2, "min_length", "The minimum length below which the score of eos_token_id is set to -Inf. Shape is (1)", "I", OpSchema::Optional) .Input(3, "repetition_penalty", "The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)", "T", OpSchema::Optional) - .Input(4, "vocab_mask", "Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)", "I", OpSchema::Optional) + .Input(4, "vocab_mask", "Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)", "I", OpSchema::Optional) .Input(5, "prefix_vocab_mask", "Mask of vocabulary for first step. 
Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)", "I", OpSchema::Optional) .Input(6, "attention_mask", "Custom attention mask. Shape is (batch_size, sequence_length)", "I", OpSchema::Optional) .Output(0, "sequences", "Word IDs of generated sequences. Shape is (batch_size, max_sequence_length)", "I") @@ -1363,7 +1367,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA(Sampling, 1, .Input(1, "max_length", "The maximum length of the sequence to be generated. Shape is (1)", "I") .Input(2, "min_length", "The minimum length below which the score of eos_token_id is set to -Inf. Shape is (1)", "I", OpSchema::Optional) .Input(3, "repetition_penalty", "The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)", "T", OpSchema::Optional) - .Input(4, "vocab_mask", "Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)", "I", OpSchema::Optional) + .Input(4, "vocab_mask", "Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)", "I", OpSchema::Optional) .Input(5, "prefix_vocab_mask", "Mask of vocabulary for first step. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)", "I", OpSchema::Optional) .Input(6, "attention_mask", "Custom attention mask. Shape is (batch_size, sequence_length)", "I", OpSchema::Optional) .Input(7, "presence_mask", "Presence penalty mask. Shape is (batch_size, vocab_size)", "I", OpSchema::Optional) diff --git a/onnxruntime/python/tools/transformers/models/whisper/README.md b/onnxruntime/python/tools/transformers/models/whisper/README.md index 02100266200f8..7a678f2734ade 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/README.md +++ b/onnxruntime/python/tools/transformers/models/whisper/README.md @@ -1,5 +1,22 @@ # Whisper +## Prerequisites + +Please note the package versions needed for using Whisper in the `requirements.txt` file that fits your scenario. +- `requirements-cpu.txt` + - For running Whisper on CPU +- `requirements-cuda.txt` + - For running Whisper on CUDA + - Note that `torch` with CUDA enabled is not installed automatically. This is because `torch` should be installed with the CUDA version used on your machine. Please visit [the PyTorch website](https://pytorch.org/get-started/locally/) to download the `torch` version that is used with the CUDA version installed on your machine and satisfies the requirement listed in the file. +- `requirements.txt` + - Package versions needed in each of the above files + +In addition to the above packages, you will need to install `ffmpeg` on your machine. Visit the [FFmpeg website](https://ffmpeg.org/) for details. You can also install it natively using package managers. + +- Linux: `sudo apt-get install ffmpeg` +- MacOS: `sudo brew install ffmpeg` +- Windows: Download from website + ## Exporting Whisper with Beam Search There are several ways to export Whisper with beam search (using Whisper tiny as an example). 
@@ -10,10 +27,10 @@ There are several ways to export Whisper with beam search (using Whisper tiny as # From source $ git clone https://github.com/microsoft/onnxruntime $ cd onnxruntime/onnxruntime/python/tools/transformers/ -$ python3 -m models.whisper.convert_to_onnx -m openai/whisper-tiny --output whispertiny --use_external_data_format +$ python3 -m models.whisper.convert_to_onnx -m openai/whisper-large-v3 --output whisperlargev3 --use_external_data_format # From wheel -$ python3 -m onnxruntime.transformers.models.whisper.convert_to_onnx -m openai/whisper-tiny --output whispertiny --use_external_data_format +$ python3 -m onnxruntime.transformers.models.whisper.convert_to_onnx -m openai/whisper-large-v3 --output whisperlargev3 --use_external_data_format ``` ### Option 2: end-to-end model from [Olive](https://github.com/microsoft/Olive/tree/main/examples/whisper) @@ -39,40 +56,49 @@ model.save_pretrained(model_name.split("/")[-1] + "-onnx") Here are some additional examples for exporting Whisper with beam search. +To see all available options +``` +# From source: +$ python3 -m models.whisper.convert_to_onnx --help + +# From wheel: +$ python3 -m onnxruntime.transformers.models.whisper.convert_to_onnx --help +``` + Export with Forced Decoder Input Ids ``` # From source: -$ python3 -m models.whisper.convert_to_onnx -m openai/whisper-tiny --output whispertiny --use_external_data_format --use_forced_decoder_ids +$ python3 -m models.whisper.convert_to_onnx -m openai/whisper-large-v3 --output whisperlargev3 --use_external_data_format --use_forced_decoder_ids # From wheel: -$ python3 -m onnxruntime.transformers.models.whisper.convert_to_onnx -m openai/whisper-tiny --output whispertiny --use_external_data_format --use_forced_decoder_ids +$ python3 -m onnxruntime.transformers.models.whisper.convert_to_onnx -m openai/whisper-large-v3 --output whisperlargev3 --use_external_data_format --use_forced_decoder_ids ``` Export + Optimize for FP32 ``` # From source: -$ python3 -m models.whisper.convert_to_onnx -m openai/whisper-tiny --output whispertiny --use_external_data_format --optimize_onnx --precision fp32 +$ python3 -m models.whisper.convert_to_onnx -m openai/whisper-large-v3 --output whisperlargev3 --use_external_data_format --optimize_onnx --precision fp32 # From wheel: -$ python3 -m onnxruntime.transformers.models.whisper.convert_to_onnx -m openai/whisper-tiny --output whispertiny --use_external_data_format --optimize_onnx --precision fp32 +$ python3 -m onnxruntime.transformers.models.whisper.convert_to_onnx -m openai/whisper-large-v3 --output whisperlargev3 --use_external_data_format --optimize_onnx --precision fp32 ``` Export + Optimize for FP16 and GPU ``` # From source: -$ python3 -m models.whisper.convert_to_onnx -m openai/whisper-tiny --output whispertiny --use_external_data_format --optimize_onnx --precision fp16 --use_gpu --provider cuda --disable_auto_mixed_precision +$ python3 -m models.whisper.convert_to_onnx -m openai/whisper-large-v3 --output whisperlargev3 --use_external_data_format --optimize_onnx --precision fp16 --use_gpu --provider cuda --disable_auto_mixed_precision # From wheel: -$ python3 -m onnxruntime.transformers.models.whisper.convert_to_onnx -m openai/whisper-tiny --output whispertiny --use_external_data_format --optimize_onnx --precision fp16 --use_gpu --provider cuda --disable_auto_mixed_precision +$ python3 -m onnxruntime.transformers.models.whisper.convert_to_onnx -m openai/whisper-large-v3 --output whisperlargev3 --use_external_data_format --optimize_onnx --precision 
fp16 --use_gpu --provider cuda --disable_auto_mixed_precision ``` Export + Quantize for INT8 ``` # From source: -$ python3 -m models.whisper.convert_to_onnx -m openai/whisper-tiny --output whispertiny --use_external_data_format --precision int8 --quantize_embedding_layer +$ python3 -m models.whisper.convert_to_onnx -m openai/whisper-large-v3 --output whisperlargev3 --use_external_data_format --precision int8 --quantize_embedding_layer # From wheel: -$ python3 -m onnxruntime.transformers.models.whisper.convert_to_onnx -m openai/whisper-tiny --output whispertiny --use_external_data_format --precision int8 --quantize_embedding_layer +$ python3 -m onnxruntime.transformers.models.whisper.convert_to_onnx -m openai/whisper-large-v3 --output whisperlargev3 --use_external_data_format --precision int8 --quantize_embedding_layer ``` ## Benchmark Whisper diff --git a/onnxruntime/python/tools/transformers/models/whisper/benchmark.py b/onnxruntime/python/tools/transformers/models/whisper/benchmark.py index 759ae6d14f184..e57385aa6db8f 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/benchmark.py +++ b/onnxruntime/python/tools/transformers/models/whisper/benchmark.py @@ -1,3 +1,9 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- + import argparse import ast import datetime @@ -54,6 +60,8 @@ def load_via_numpy(): inputs["decoder_input_ids"] = np.array([args.decoder_input_ids], dtype=np.int32) if args.has_logits_processor: inputs["logits_processor"] = np.array([args.logits_processor], dtype=np.int32) + if args.has_temperature: + inputs["temperature"] = np.array([args.temperature], dtype=np.float32) # Measure time taken to load audio file logger.info(f"Load audio: {args.audio_path}") @@ -163,6 +171,7 @@ def get_model(args: argparse.Namespace): def time_fn(args, fn, inputs): warmup_inputs = inputs[0] if type(inputs) is tuple else inputs benchmark_inputs = inputs[1] if type(inputs) is tuple else inputs + torch_device = torch.device(args.target_device) # Warm up warmup_range = ( @@ -180,7 +189,7 @@ def time_fn(args, fn, inputs): # Benchmark if args.device != "cpu": - torch.cuda.synchronize() + torch.cuda.synchronize(torch_device) start_time = time.time() bench_range = ( @@ -192,7 +201,7 @@ def time_fn(args, fn, inputs): fn(benchmark_inputs) if args.device != "cpu": - torch.cuda.synchronize() + torch.cuda.synchronize(torch_device) end_time = time.time() # Newline print after trange in order to print metrics on new lines without progress bar on same line @@ -500,7 +509,13 @@ def parse_args(): "--logits-processor", type=int, default=1, - help="Type of logits processor to use. 
See `BeamSearch` in https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/graph/contrib_ops/contrib_defs.cc for details.", + help="Whether to use timestamps logits processor or not (0 for false, 1 for true).", + ) + parser.add_argument( + "--temperature", + type=float, + default=1.0, + help="Temperature value for generation.", ) # Args for accessing detailed info @@ -581,6 +596,7 @@ def main(): args.has_audio_stream = "audio_stream" in ort_model_inputs setattr(args, "has_decoder_input_ids", "decoder_input_ids" in ort_model_inputs) # noqa: B010 setattr(args, "has_logits_processor", "logits_processor" in ort_model_inputs) # noqa: B010 + setattr(args, "has_temperature", "temperature" in ort_model_inputs) # noqa: B010 if args.decoder_input_ids == []: args.decoder_input_ids = [config.decoder_start_token_id] diff --git a/onnxruntime/python/tools/transformers/models/whisper/benchmark_all.py b/onnxruntime/python/tools/transformers/models/whisper/benchmark_all.py index d205a2d340721..814b0dd1ef6ac 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/benchmark_all.py +++ b/onnxruntime/python/tools/transformers/models/whisper/benchmark_all.py @@ -1,3 +1,9 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- + import argparse import datetime import json diff --git a/onnxruntime/python/tools/transformers/models/whisper/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/whisper/convert_to_onnx.py index bb697fe1e1506..35211aab272e4 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/whisper/convert_to_onnx.py @@ -28,17 +28,25 @@ def parse_arguments(argv=None): parser = argparse.ArgumentParser() - pretrained_models = PRETRAINED_WHISPER_MODELS - parser.add_argument( + conversion_args = parser.add_argument_group("Conversion Process Args") + optional_inputs = parser.add_argument_group("Optional Inputs (for WhisperBeamSearch op)") + optional_outputs = parser.add_argument_group("Optional Outputs (for WhisperBeamSearch op)") + quant_args = parser.add_argument_group("INT8 Quantization Args") + + ################################# + # Conversion options for Whisper + ################################# + + conversion_args.add_argument( "-m", "--model_name_or_path", required=False, default=PRETRAINED_WHISPER_MODELS[0], type=str, - help="Model path, or pretrained model name in the list: " + ", ".join(pretrained_models), + help="Model path, or pretrained model name in the list: " + ", ".join(PRETRAINED_WHISPER_MODELS), ) - parser.add_argument( + conversion_args.add_argument( "--model_impl", required=False, default="hf", @@ -47,7 +55,7 @@ def parse_arguments(argv=None): help="Select implementation for export of encoder and decoder subgraphs", ) - parser.add_argument( + conversion_args.add_argument( "--cache_dir", required=False, type=str, @@ -55,7 +63,7 @@ def parse_arguments(argv=None): help="Directory to cache pre-trained models", ) - parser.add_argument( + conversion_args.add_argument( "--output", required=False, type=str, @@ -63,19 +71,24 @@ def parse_arguments(argv=None): help="Output directory", ) - parser.add_argument( + conversion_args.add_argument( "-o", "--optimize_onnx", required=False, action="store_true", help="Use 
optimizer.py to optimize onnx model", ) - parser.set_defaults(optimize_onnx=False) + conversion_args.set_defaults(optimize_onnx=False) - parser.add_argument("--use_gpu", required=False, action="store_true", help="use GPU for inference") - parser.set_defaults(use_gpu=False) + conversion_args.add_argument( + "--use_gpu", + required=False, + action="store_true", + help="Use GPU for model inference", + ) + conversion_args.set_defaults(use_gpu=False) - parser.add_argument( + conversion_args.add_argument( "-p", "--precision", required=False, @@ -85,221 +98,226 @@ def parse_arguments(argv=None): help="Precision of model to run. fp32 for full precision, fp16 for half precision, int8 for quantization", ) - parser.add_argument("--verbose", required=False, action="store_true") - parser.set_defaults(verbose=False) - - parser.add_argument("-e", "--use_external_data_format", required=False, action="store_true") - parser.set_defaults(use_external_data_format=False) - - parser.add_argument( - "-s", - "--use_decoder_start_token", + conversion_args.add_argument( + "--use_int64_inputs", required=False, action="store_true", - help="Use config.decoder_start_token_id. Otherwise, add an extra graph input to \ - the encoder-decoder-init subgraph for decoder_input_ids.", + help="Use int64 instead of int32 for input_ids and attention_mask.", ) - parser.set_defaults(use_decoder_start_token=False) + conversion_args.set_defaults(use_int64_inputs=False) - parser.add_argument( - "-f", - "--use_forced_decoder_ids", + conversion_args.add_argument( + "--disable_auto_mixed_precision", required=False, action="store_true", - help="Use decoder_input_ids as an extra graph input to the beam search op", + help="Use pure fp16 instead of mixed precision", ) - parser.set_defaults(use_forced_decoder_ids=False) + conversion_args.set_defaults(disable_auto_mixed_precision=False) - parser.add_argument( - "-l", - "--use_logits_processor", + conversion_args.add_argument( + "-r", + "--provider", required=False, - action="store_true", - help="Use logits_processor as an extra graph input to enable specific logits processing", + type=str, + default="cpu", + choices=list(PROVIDERS.keys()), + help="Provider to benchmark. Default is CPUExecutionProvider.", ) - parser.set_defaults(use_specific_logits_processor=False) - parser.add_argument( - "-v", - "--use_vocab_mask", + conversion_args.add_argument( + "--verbose", required=False, action="store_true", - help="Use vocab_mask as an extra graph input to enable specific logits processing", + help="Enable verbose logging", ) - parser.set_defaults(use_vocab_mask=False) + conversion_args.set_defaults(verbose=False) - parser.add_argument( - "-u", - "--use_prefix_vocab_mask", + conversion_args.add_argument( + "-e", + "--use_external_data_format", required=False, action="store_true", - help="Use prefix_vocab_mask as an extra graph input to enable specific logits processing", + help="Save weights in external file. Necessary for 'small', 'medium', and 'large' models. 
Optional for 'tiny' and 'base' models.", ) - parser.set_defaults(use_prefix_vocab_mask=False) + conversion_args.set_defaults(use_external_data_format=False) - parser.add_argument( + conversion_args.add_argument( "-w", "--overwrite", required=False, action="store_true", - help="overwrite existing ONNX model", + help="Overwrite existing ONNX model", ) - parser.set_defaults(overwrite=False) + conversion_args.set_defaults(overwrite=False) - parser.add_argument( - "--disable_auto_mixed_precision", + conversion_args.add_argument( + "--separate_encoder_and_decoder_init", required=False, action="store_true", - help="use pure fp16 instead of mixed precision", + help="Do not merge encoder and decoder init to initialize past KV caches. Output 3 instead of 2 ONNX models.", ) - parser.set_defaults(disable_auto_mixed_precision=False) + conversion_args.set_defaults(separate_encoder_and_decoder_init=False) - parser.add_argument( - "--separate_encoder_and_decoder_init", + conversion_args.add_argument( + "--no_beam_search_op", required=False, action="store_true", - help="Do not merge encode and decoder init. Output 3 instead of 2 onnx models.", + help="Do not produce model with WhisperBeamSearch op, which chains encdecinit and decoder models into one op.", ) - parser.set_defaults(separate_encoder_and_decoder_init=False) + conversion_args.set_defaults(no_beam_search_op=False) - parser.add_argument( - "--use_int64_inputs", + conversion_args.add_argument( + "--state_dict_path", + type=str, + default="", + help="Filepath to load pre-trained model with custom state dictionary (e.g. pytorch_model.bin)", + ) + + ############################################################# + # Optional inputs for Whisper + # (listed below in the order that WhisperBeamSearch expects) + ############################################################# + + optional_inputs.add_argument( + "-v", + "--use_vocab_mask", required=False, action="store_true", - help="Use int64 instead of int32 for input_ids, position_ids and attention_mask.", + help="Use vocab_mask as an extra graph input to enable specific logits processing", ) - parser.set_defaults(use_int64_inputs=False) + optional_inputs.set_defaults(use_vocab_mask=False) - parser.add_argument( - "--chain_model", + optional_inputs.add_argument( + "-u", + "--use_prefix_vocab_mask", required=False, action="store_true", - help="Produce beam search model with chained encdecinit and decoder.", + help="Use prefix_vocab_mask as an extra graph input to enable specific logits processing", ) - parser.set_defaults(chain_model=True) + optional_inputs.set_defaults(use_prefix_vocab_mask=False) - parser.add_argument( - "--use_whisper_beamsearch", + optional_inputs.add_argument( + "-f", + "--use_forced_decoder_ids", required=False, action="store_true", - help="When chain_model, using WhisperBeamSearch operator rather than BeamSearch operator. \ - It will be set to true when collect_cross_qk, extra_decoding_ids or output_no_speech_probs is set.", + help="Use decoder_input_ids as an extra graph input to the beam search op", ) - parser.set_defaults(use_whisper_beamsearch=False) + optional_inputs.set_defaults(use_forced_decoder_ids=False) - parser.add_argument( - "--extra_decoding_ids", + optional_inputs.add_argument( + "-l", + "--use_logits_processor", required=False, action="store_true", - help="Need extra starting decoding ids for some feature like cross qk. 
Default if false.", + help="Use logits_processor as an extra graph input to enable specific logits processing", ) - parser.set_defaults(extra_decoding_ids=False) + optional_inputs.set_defaults(use_specific_logits_processor=False) - parser.add_argument( + optional_inputs.add_argument( "--collect_cross_qk", required=False, action="store_true", help="Beam search model collect stacked cross QK.", ) - parser.set_defaults(collect_cross_qk=False) + optional_inputs.set_defaults(collect_cross_qk=False) - parser.add_argument( - "--output_cross_qk", + optional_inputs.add_argument( + "--extra_decoding_ids", required=False, action="store_true", - help="Beam search model output collected qk as output. Also hint collect_cross_qk", + help="Need extra starting decoding ids for some feature like cross qk. Default if false.", ) - parser.set_defaults(output_cross_qk=False) + optional_inputs.set_defaults(extra_decoding_ids=False) - parser.add_argument( - "--no_speech_token_id", - default=50362, + optional_inputs.add_argument( + "-t", + "--use_temperature", + required=False, + action="store_true", + help="Use temperature as an extra graph input for the WhisperBeamSearch op", + ) + optional_inputs.set_defaults(use_temperature=False) + + optional_inputs.add_argument( + "--no_repeat_ngram_size", type=int, - help="specify no_speech_token_id. Default is 50362. if >= 0, will be add into beam search attr. \ - Note that default value maybe different between the multilingual and English-only models.", + default=0, + help="default to 0", ) - parser.add_argument( - "--output_no_speech_probs", + ############################################################# + # Optional outputs for Whisper + # (listed below in the order that WhisperBeamSearch expects) + ############################################################# + + optional_outputs.add_argument( + "--output_sequence_scores", required=False, action="store_true", - help="Beam search model output no speech probs which is computed from the encoder/context-decoder graph.", + help="Beam search model output scores for each generated sequence.", ) - parser.set_defaults(output_no_speech_probs=False) + optional_outputs.set_defaults(output_sequence_scores=False) - parser.add_argument( + optional_outputs.add_argument( "--output_scores", required=False, action="store_true", help="Beam search model output scores over vocab per generated token.", ) - parser.set_defaults(output_scores=False) + optional_outputs.set_defaults(output_scores=False) - parser.add_argument( - "--output_sequence_scores", + optional_outputs.add_argument( + "--output_cross_qk", required=False, action="store_true", - help="Beam search model output scores for each generated sequence.", + help="Beam search model output collected qk as output. 
Also hint collect_cross_qk", ) - parser.set_defaults(output_sequence_scores=False) + optional_outputs.set_defaults(output_cross_qk=False) - parser.add_argument( + optional_outputs.add_argument( "--cross_qk_onnx_model", required=False, type=str, default=None, - help="the model which consume cross_qk.", + help="The model which consumes cross_qk outputs.", ) - parser.add_argument( - "--beam_output_model", - type=str, - default="whisper_beamsearch.onnx", - help="default name is whisper_beamsearch.onnx.", + optional_outputs.add_argument( + "--output_no_speech_probs", + required=False, + action="store_true", + help="Beam search model output no speech probs which is computed from the encoder/context-decoder graph.", ) + optional_outputs.set_defaults(output_no_speech_probs=False) - parser.add_argument( + ################################### + # Quantization options for Whisper + ################################### + + quant_args.add_argument( "--quantize_embedding_layer", required=False, action="store_true", help="Quantize MatMul, GEMM, and Gather.", ) - parser.set_defaults(quantize_embedding_layer=False) + quant_args.set_defaults(quantize_embedding_layer=False) - parser.add_argument( + quant_args.add_argument( "--quantize_per_channel", required=False, action="store_true", help="Quantize weights per each channel.", ) - parser.set_defaults(quantize_per_channel=False) + quant_args.set_defaults(quantize_per_channel=False) - parser.add_argument( + quant_args.add_argument( "--quantize_reduce_range", required=False, action="store_true", help="Quantize weights with 7 bits.", ) - parser.set_defaults(quantize_reduce_range=False) - - parser.add_argument("--no_repeat_ngram_size", type=int, default=0, help="default to 0") - - parser.add_argument( - "--state_dict_path", - type=str, - default="", - help="filepath to load pre-trained model with custom state dictionary (e.g. pytorch_model.bin)", - ) - - parser.add_argument( - "-r", - "--provider", - required=False, - type=str, - default="cpu", - choices=list(PROVIDERS.keys()), - help="Provider to benchmark. Default is CPUExecutionProvider.", - ) + quant_args.set_defaults(quantize_reduce_range=False) args = parser.parse_args(argv) args.collect_cross_qk = args.collect_cross_qk or args.output_cross_qk @@ -317,7 +335,7 @@ def export_onnx_models( optimize_onnx, precision, verbose, - use_decoder_start_token: bool = False, + use_forced_decoder_ids: bool = False, merge_encoder_and_decoder_init: bool = True, overwrite: bool = False, disable_auto_mixed_precision: bool = False, @@ -362,7 +380,6 @@ def export_onnx_models( onnx_path, verbose, use_external_data_format, - use_decoder_input_ids=not use_decoder_start_token, use_int32_inputs=use_int32_inputs, ) else: @@ -406,7 +423,7 @@ def export_onnx_models( extra_options={"MatMulConstBOnly": True}, ) else: - logger.info(f"Skip optimizing: existed ONNX model {onnx_path}") + logger.info(f"Skip optimizing: existing ONNX model {onnx_path}") else: output_path = onnx_path @@ -449,7 +466,7 @@ def main(argv=None): args.optimize_onnx, args.precision, args.verbose, - args.use_decoder_start_token, + args.use_forced_decoder_ids, not args.separate_encoder_and_decoder_init, args.overwrite, args.disable_auto_mixed_precision, @@ -462,7 +479,7 @@ def main(argv=None): ) max_diff = 0 - if args.chain_model: + if not args.no_beam_search_op: logger.info("Chaining model ... 
:") args.beam_model_output_dir = WhisperHelper.get_onnx_path( output_dir, diff --git a/onnxruntime/python/tools/transformers/models/whisper/requirements-cpu.txt b/onnxruntime/python/tools/transformers/models/whisper/requirements-cpu.txt new file mode 100644 index 0000000000000..db2cd95324328 --- /dev/null +++ b/onnxruntime/python/tools/transformers/models/whisper/requirements-cpu.txt @@ -0,0 +1,2 @@ +-r requirements.txt +onnxruntime>=1.17.1 \ No newline at end of file diff --git a/onnxruntime/python/tools/transformers/models/whisper/requirements-cuda.txt b/onnxruntime/python/tools/transformers/models/whisper/requirements-cuda.txt new file mode 100644 index 0000000000000..9bd215de9bc09 --- /dev/null +++ b/onnxruntime/python/tools/transformers/models/whisper/requirements-cuda.txt @@ -0,0 +1,4 @@ +-r requirements.txt +# Please manually install torch>=1.13.0 with CUDA enabled for the CUDA version installed in your system. +# Instructions can be found here: https://pytorch.org/get-started/locally/ +onnxruntime-gpu>=1.17.1 diff --git a/onnxruntime/python/tools/transformers/models/whisper/requirements.txt b/onnxruntime/python/tools/transformers/models/whisper/requirements.txt new file mode 100644 index 0000000000000..c307a3665f8a0 --- /dev/null +++ b/onnxruntime/python/tools/transformers/models/whisper/requirements.txt @@ -0,0 +1,11 @@ +torch>=1.13.0 +transformers>=4.24.0 +openai-whisper +ffmpeg-python +datasets +soundfile +librosa +optimum +onnxruntime-extensions>=0.9.0 +protobuf==3.20.2 +numpy==1.23.3 \ No newline at end of file diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py index a74666b7af297..14691da4ad643 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py +++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py @@ -1,3 +1,9 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
+# -------------------------------------------------------------------------- + import logging import os @@ -9,7 +15,7 @@ update_decoder_subgraph_share_buffer_and_use_decoder_masked_mha, ) from onnx import TensorProto, helper -from transformers import WhisperConfig +from transformers import WhisperConfig, WhisperTokenizer logger = logging.getLogger(__name__) @@ -23,11 +29,22 @@ def verify_inputs(beam_inputs, graph_inputs): assert graph_input.name in beam_input +def clean_list(arr, remove_all_strings=True): + if remove_all_strings: + # Remove all empty strings in list + return list(filter(lambda elm: elm != "", arr)) + + # Remove empty strings at end of list + while len(arr) > 0: + if arr[-1] == "": + arr.pop() + else: + break + return arr + + def chain_model(args): - # Load encoder/decoder and insert necessary (but unused) graph inputs expected by BeamSearch op or WhisperBeamSearch op - args.use_whisper_beamsearch = ( - args.use_whisper_beamsearch or args.collect_cross_qk or args.output_no_speech_probs or args.extra_decoding_ids - ) + # Load encoder/decoder and insert necessary (but unused) graph inputs expected by WhisperBeamSearch op encoder_model = onnx.load_model(args.encoder_path, load_external_data=True) encoder_model.graph.name = "encoderdecoderinit subgraph" @@ -35,7 +52,10 @@ def chain_model(args): decoder_model.graph.name = "decoder subgraph" config = WhisperConfig.from_pretrained(args.model_name_or_path) + tokenizer = WhisperTokenizer.from_pretrained(args.model_name_or_path) + # Create inputs/outputs for WhisperBeamSearch op + temperature_name = "temperature_fp16" if args.precision == Precision.FLOAT16 else "temperature" beam_inputs = [ "input_features_fp16" if args.precision == Precision.FLOAT16 else "input_features", "max_length", @@ -44,38 +64,27 @@ def chain_model(args): "num_return_sequences", "length_penalty_fp16" if args.precision == Precision.FLOAT16 else "length_penalty", "repetition_penalty_fp16" if args.precision == Precision.FLOAT16 else "repetition_penalty", - "vocab_mask" if args.use_prefix_vocab_mask else "", + "vocab_mask" if args.use_vocab_mask else "", "prefix_vocab_mask" if args.use_prefix_vocab_mask else "", "", # attention mask "decoder_input_ids" if args.use_forced_decoder_ids else "", "logits_processor" if args.use_logits_processor else "", + "cross_qk_layer_head" if args.collect_cross_qk else "", + "extra_decoding_ids" if args.extra_decoding_ids else "", + temperature_name if args.use_temperature else "", ] - beam_outputs = ["sequences"] - if args.output_sequence_scores: - beam_outputs.append("sequence_scores_fp16" if args.precision == Precision.FLOAT16 else "sequence_scores") - if args.output_scores: - beam_outputs.append("scores_fp16" if args.precision == Precision.FLOAT16 else "scores") - - if args.use_whisper_beamsearch: - assert len(beam_inputs) == 12 - beam_inputs.extend( - [ - "cross_qk_layer_head" if args.collect_cross_qk else "", - "extra_decoding_ids" if args.extra_decoding_ids else "", - ] - ) - if args.collect_cross_qk: - while len(beam_outputs) < 3: - beam_outputs.extend([""]) - beam_outputs.extend(["cross_qk"]) - if args.output_no_speech_probs: - while len(beam_outputs) < 4: - beam_outputs.extend([""]) - beam_outputs.extend(["no_speech_probs_beam"]) - - input_features_cast_node, len_pen_cast_node, rep_pen_cast_node = None, None, None - output_scores_cast_node = output_sequence_scores_cast_node = None + sequence_scores_name = "sequence_scores_fp16" if args.precision == Precision.FLOAT16 else "sequence_scores" + scores_name = "scores_fp16" if 
args.precision == Precision.FLOAT16 else "scores" + beam_outputs = [ + "sequences", + sequence_scores_name if args.output_sequence_scores else "", + scores_name if args.output_scores else "", + "cross_qk" if args.collect_cross_qk else "", + "no_speech_probs_beam" if args.output_no_speech_probs else "", + ] + + graph_nodes = [] if args.precision == Precision.FLOAT16: input_features_cast_node = helper.make_node( "Cast", @@ -98,6 +107,18 @@ def chain_model(args): name="CastRepetitionPenaltyToFp16", to=TensorProto.FLOAT16, ) + graph_nodes.extend([input_features_cast_node, len_pen_cast_node, rep_pen_cast_node]) + + if args.use_temperature: + temp_cast_node = helper.make_node( + "Cast", + inputs=["temperature"], + outputs=["temperature_fp16"], + name="temperature_to_fp16", + to=TensorProto.FLOAT16, + ) + graph_nodes.append(temp_cast_node) + if args.output_sequence_scores: output_sequence_scores_cast_node = helper.make_node( "Cast", @@ -106,6 +127,8 @@ def chain_model(args): name="CastOutputSequenceScoresToFp32", to=TensorProto.FLOAT, ) + graph_nodes.append(output_sequence_scores_cast_node) + if args.output_scores: output_scores_cast_node = helper.make_node( "Cast", @@ -114,26 +137,38 @@ def chain_model(args): name="CastScoresToFp32", to=TensorProto.FLOAT, ) - - operator_type = "WhisperBeamSearch" if args.use_whisper_beamsearch else "BeamSearch" - node = helper.make_node(operator_type, inputs=beam_inputs, outputs=beam_outputs, name="BeamSearch_zcode") - node.domain = "com.microsoft" - node.attribute.extend( - [ - helper.make_attribute("eos_token_id", config.eos_token_id), - helper.make_attribute("pad_token_id", config.pad_token_id), - helper.make_attribute("decoder_start_token_id", config.decoder_start_token_id), - helper.make_attribute("no_repeat_ngram_size", args.no_repeat_ngram_size), - helper.make_attribute("early_stopping", True), - helper.make_attribute("model_type", 2), - ] + graph_nodes.append(output_scores_cast_node) + + # Create WhisperBeamSearch op + beam_search_attrs = [ + helper.make_attribute("eos_token_id", config.eos_token_id), + helper.make_attribute("pad_token_id", config.pad_token_id), + helper.make_attribute( + "decoder_start_token_id", config.decoder_start_token_id + ), # same as tokenizer.convert_tokens_to_ids(['<|startoftranscript|>'])[0] + helper.make_attribute("translate_token_id", tokenizer.convert_tokens_to_ids(["<|translate|>"])[0]), + helper.make_attribute("transcribe_token_id", tokenizer.convert_tokens_to_ids(["<|transcribe|>"])[0]), + helper.make_attribute("start_of_lm_token_id", tokenizer.convert_tokens_to_ids(["<|startoflm|>"])[0]), + helper.make_attribute("no_speech_token_id", tokenizer.convert_tokens_to_ids(["<|nospeech|>"])[0]) + if args.output_no_speech_probs + else "", + helper.make_attribute("no_timestamps_token_id", tokenizer.convert_tokens_to_ids(["<|notimestamps|>"])[0]), + helper.make_attribute("beginning_timestamp_token_id", tokenizer.convert_tokens_to_ids(["<|0.00|>"])[0]), + helper.make_attribute("no_repeat_ngram_size", args.no_repeat_ngram_size), + helper.make_attribute("early_stopping", True), + helper.make_attribute("model_type", 2), + helper.make_attribute("decoder_output_cross_qk", 1) if args.collect_cross_qk else "", + ] + node = helper.make_node( + "WhisperBeamSearch", + inputs=clean_list(beam_inputs, remove_all_strings=False), + outputs=clean_list(beam_outputs, remove_all_strings=False), + name="BeamSearch", + domain="com.microsoft", ) - if args.use_whisper_beamsearch: - if args.collect_cross_qk: - 
node.attribute.extend([helper.make_attribute("decoder_output_cross_qk", 1)]) - if args.no_speech_token_id >= 0: - node.attribute.extend([helper.make_attribute("no_speech_token", args.no_speech_token_id)]) + node.attribute.extend(clean_list(beam_search_attrs, remove_all_strings=True)) + # Graph inputs input_features = helper.make_tensor_value_info( "input_features", TensorProto.FLOAT, ["batch_size", "feature_size", "sequence_length"] ) @@ -143,73 +178,63 @@ def chain_model(args): num_return_sequences = helper.make_tensor_value_info("num_return_sequences", TensorProto.INT32, [1]) length_penalty = helper.make_tensor_value_info("length_penalty", TensorProto.FLOAT, [1]) repetition_penalty = helper.make_tensor_value_info("repetition_penalty", TensorProto.FLOAT, [1]) + vocab_mask = helper.make_tensor_value_info("vocab_mask", TensorProto.INT32, [config.vocab_size]) + prefix_vocab_mask = helper.make_tensor_value_info( + "prefix_vocab_mask", TensorProto.INT32, ["batch_size", config.vocab_size] + ) + decoder_input_ids = helper.make_tensor_value_info( + "decoder_input_ids", TensorProto.INT32, ["batch_size", "initial_sequence_length"] + ) + logits_processor = helper.make_tensor_value_info("logits_processor", TensorProto.INT32, [1]) + cross_qk_layer_head = helper.make_tensor_value_info("cross_qk_layer_head", TensorProto.INT32, ["num_layer_head", 2]) + extra_decoding_ids = helper.make_tensor_value_info( + "extra_decoding_ids", TensorProto.INT32, ["batch_size", "extra_decoding_ids_len"] + ) + temperature = helper.make_tensor_value_info("temperature", TensorProto.FLOAT, [1]) - graph_inputs = [ - input_features, - max_length, - min_length, - num_beams, - num_return_sequences, - length_penalty, - repetition_penalty, - ] - if args.use_vocab_mask: - vocab_mask = helper.make_tensor_value_info("vocab_mask", TensorProto.INT32, [config.vocab_size]) - graph_inputs.append(vocab_mask) - - if args.use_prefix_vocab_mask: - prefix_vocab_mask = helper.make_tensor_value_info( - "prefix_vocab_mask", TensorProto.INT32, ["batch_size", config.vocab_size] - ) - graph_inputs.append(prefix_vocab_mask) - - if args.use_forced_decoder_ids: - decoder_input_ids = helper.make_tensor_value_info( - "decoder_input_ids", TensorProto.INT32, ["batch_size", "initial_sequence_length"] - ) - graph_inputs.append(decoder_input_ids) - - if args.use_logits_processor: - logits_processor = helper.make_tensor_value_info("logits_processor", TensorProto.INT32, [1]) - graph_inputs.append(logits_processor) - - if args.collect_cross_qk: - cross_qk_layer_head = helper.make_tensor_value_info( - "cross_qk_layer_head", TensorProto.INT32, ["num_layer_head", 2] - ) - graph_inputs.append(cross_qk_layer_head) - - if args.extra_decoding_ids: - extra_decoding_ids = helper.make_tensor_value_info( - "extra_decoding_ids", TensorProto.INT32, ["batch_size", "extra_decoding_ids_len"] - ) - graph_inputs.append(extra_decoding_ids) + graph_inputs = clean_list( + [ + input_features, + max_length, + min_length, + num_beams, + num_return_sequences, + length_penalty, + repetition_penalty, + vocab_mask if args.use_vocab_mask else "", + prefix_vocab_mask if args.use_prefix_vocab_mask else "", + decoder_input_ids if args.use_forced_decoder_ids else "", + logits_processor if args.use_logits_processor else "", + cross_qk_layer_head if args.collect_cross_qk else "", + extra_decoding_ids if args.extra_decoding_ids else "", + temperature if args.use_temperature else "", + ] + ) - # graph outputs + # Graph outputs sequences = helper.make_tensor_value_info( "sequences", 
TensorProto.INT32, ["batch_size", "num_return_sequences", "max_length"] ) - graph_outputs = [sequences] - if args.output_cross_qk or (not args.cross_qk_onnx_model and args.collect_cross_qk): - cross_qk = helper.make_tensor_value_info( - "cross_qk", - TensorProto.FLOAT, - ["batch_size", "num_return_sequences", "num_layer_head_cross_qk", "max_length", "frames"], - ) - graph_outputs.extend([cross_qk]) - - if args.output_no_speech_probs: - no_speech_probs = helper.make_tensor_value_info("no_speech_probs", TensorProto.FLOAT, ["batch_size"]) - graph_outputs.extend([no_speech_probs]) - - if args.output_sequence_scores: - sequence_scores = helper.make_tensor_value_info("sequence_scores", TensorProto.FLOAT, ["batch_size"]) - graph_outputs.extend([sequence_scores]) + sequence_scores = helper.make_tensor_value_info("sequence_scores", TensorProto.FLOAT, ["batch_size"]) + scores = helper.make_tensor_value_info("scores", TensorProto.FLOAT, ["batch_size"]) + cross_qk = helper.make_tensor_value_info( + "cross_qk", + TensorProto.FLOAT, + ["batch_size", "num_return_sequences", "num_layer_head_cross_qk", "max_length", "frames"], + ) + no_speech_probs = helper.make_tensor_value_info("no_speech_probs", TensorProto.FLOAT, ["batch_size"]) - if args.output_scores: - scores = helper.make_tensor_value_info("scores", TensorProto.FLOAT, ["batch_size"]) - graph_outputs.extend([scores]) + graph_outputs = clean_list( + [ + sequences, + sequence_scores if args.output_sequence_scores else "", + scores if args.output_scores else "", + cross_qk if args.output_cross_qk or (not args.cross_qk_onnx_model and args.collect_cross_qk) else "", + no_speech_probs if args.output_no_speech_probs else "", + ] + ) + # Replace MultiHeadAttention with DecoderMaskedMultiHeadAttention for CUDA EP inference if hasattr(args, "use_gpu") and args.use_gpu: if update_decoder_subgraph_share_buffer_and_use_decoder_masked_mha(decoder_model.graph): logger.info("Updated whisper decoder subgraph to use DecoderMaskedMultiHeadAttention successfully!") @@ -230,19 +255,7 @@ def chain_model(args): opset_import = [helper.make_opsetid(domain="com.microsoft", version=1), helper.make_opsetid(domain="", version=17)] - graph_nodes = ( - [ - input_features_cast_node, - len_pen_cast_node, - rep_pen_cast_node, - node, - output_sequence_scores_cast_node, - output_scores_cast_node, - ] - if args.precision == Precision.FLOAT16 - else [node] - ) - graph_nodes = [node for node in graph_nodes if node is not None] + graph_nodes.append(node) if args.output_no_speech_probs: prob_cast_node = helper.make_node( "Cast", @@ -251,9 +264,16 @@ def chain_model(args): name="no_speech_probs_cast_to_fp32", to=TensorProto.FLOAT, ) - graph_nodes.extend([prob_cast_node]) - - beam_graph = helper.make_graph(graph_nodes, "beam-search-test", graph_inputs, graph_outputs, initializers) + graph_nodes.append(prob_cast_node) + + # Make graph with WhisperBeamSearch op + beam_graph = helper.make_graph( + graph_nodes, + name="WhisperBeamSearch Graph", + inputs=graph_inputs, + outputs=graph_outputs, + initializer=initializers, + ) beam_graph_input_names = [gi.name for gi in graph_inputs] beam_graph_output_names = [go.name for go in graph_outputs] @@ -287,10 +307,12 @@ def chain_model(args): ir_version=decoder_model.ir_version, ) + # Save WhisperBeamSearch graph and external data if os.path.isfile(args.beam_model_output_dir): logger.info(f"Overwriting {args.beam_model_output_dir} and {args.beam_model_output_dir + '.data'}") os.remove(args.beam_model_output_dir) os.remove(args.beam_model_output_dir + 
".data") + onnx.save( beam_model, args.beam_model_output_dir, diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py index 0d69960a095ac..93fd64c9eb7d3 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py +++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py @@ -170,7 +170,7 @@ def create_dummy( cross_attention_past_shape = [ batch_size, num_attention_heads, - past_decode_sequence_length, + encode_sequence_length, head_size, ] diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py index 351173f525727..832f692e9980d 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py +++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py @@ -75,7 +75,7 @@ def create_dummy( config: WhisperConfig, batch_size: int, encode_sequence_length: int, - use_decoder_input_ids: int, + use_decoder_input_ids: bool, device: torch.device, use_int32_inputs: bool = False, ): # -> WhisperEncoderDecoderInitInputs: @@ -125,7 +125,7 @@ def export_onnx( model.config, batch_size=2, encode_sequence_length=3000, - use_decoder_input_ids=use_decoder_input_ids, + use_decoder_input_ids=True, device=device, use_int32_inputs=use_int32_inputs, ) @@ -159,7 +159,7 @@ def export_onnx( hidden_size = str(model.config.d_model) head_size = str(model.config.d_model // model.config.encoder_attention_heads) dynamic_axes = { - "encoder_input_ids": {0: "batch_size", 1: "encode_sequence_length"}, + "encoder_input_ids": {0: "batch_size", 1: "feature_size"}, "encoder_hidden_states": { 0: "batch_size", 1: "encode_sequence_length", diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_helper.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_helper.py index e2dc79ca247ce..1b47b9426d983 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/whisper_helper.py +++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_helper.py @@ -6,12 +6,14 @@ import logging import os -import sys from pathlib import Path from typing import Dict, Tuple, Union import numpy as np import torch +from float16 import float_to_float16_max_diff +from onnx_model import OnnxModel +from optimizer import optimize_model from packaging import version from transformers import WhisperConfig, WhisperForConditionalGeneration, WhisperProcessor from transformers import __version__ as transformers_version @@ -21,24 +23,20 @@ from onnxruntime import InferenceSession -sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from float16 import float_to_float16_max_diff -from onnx_model import OnnxModel -from optimizer import optimize_model - logger = logging.getLogger(__name__) PRETRAINED_WHISPER_MODELS = [ "whisper-tiny", "whisper-tiny.en", + "whisper-base", + "whisper-base.en", "whisper-small", "whisper-small.en", "whisper-medium", "whisper-medium.en", - "whisper-base", - "whisper-base.en", "whisper-large", "whisper-large-v2", + "whisper-large-v3", ] @@ -346,7 +344,12 @@ def verify_onnx( ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") input_features = processor([ds[0]["audio"]["array"]], return_tensors="pt").input_features - batch_size, max_length, min_length, num_beams, num_return_sequences = 1, 26, 0, 5, 1 + start_id = 
[config.decoder_start_token_id] # ex: [50258] + prompt_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe") + prompt_ids = list(map(lambda token: token[1], prompt_ids)) # ex: [50259, 50358, 50363] + forced_decoder_ids = start_id + prompt_ids # ex: [50258, 50259, 50358, 50363] + + batch_size, max_length, min_length, num_beams, num_return_sequences = 1, 30, 0, 1, 1 length_penalty, repetition_penalty = 1.0, 1.0 inputs = { "input_features": input_features.to(device), @@ -383,43 +386,51 @@ def verify_onnx( elif name == "prefix_vocab_mask": inputs[name] = np.ones((batch_size, config.vocab_size), dtype=ort_to_np[dtype]) elif name == "decoder_input_ids": - raw_input_ids = ( - [[config.decoder_start_token_id]] - if use_extra_decoding_ids - else [[config.decoder_start_token_id, 50259, 50359, 50363]] - ) + raw_input_ids = [start_id] if use_extra_decoding_ids else [forced_decoder_ids] inputs[name] = np.array(raw_input_ids, dtype=ort_to_np[dtype]) elif name == "logits_processor": inputs[name] = np.array([1], dtype=ort_to_np[dtype]) elif name == "cross_qk_layer_head": inputs[name] = np.array([[0, 0]], dtype=ort_to_np[dtype]) elif name == "extra_decoding_ids": - inputs[name] = np.repeat(np.array([[50259, 50359, 50363]], dtype=ort_to_np[dtype]), batch_size, 0) + inputs[name] = np.repeat(np.array([prompt_ids], dtype=ort_to_np[dtype]), batch_size, 0) + elif name == "temperature": + inputs[name] = np.array([1.0], dtype=ort_to_np[dtype]) else: inputs[name] = np.array([inputs[name]], dtype=ort_to_np[dtype]) ort_outputs = ort_session.run(None, inputs)[0][0] - if pt_outputs.shape != ort_outputs.shape: - logger.warning("PyTorch and ONNX Runtime outputs do not have the same shape") + expected_transcription_no_comma = ( + " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel." + ) + expected_transcription_with_comma = ( + " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel." + ) + expected_transcription_with_quote_and_comma = ( + ' "Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.' + ) + expected_transcription_options = { + expected_transcription_no_comma, + expected_transcription_with_comma, + expected_transcription_with_quote_and_comma, + } + pt_transcription = processor.batch_decode(pt_outputs, skip_special_tokens=True)[0] + ort_transcription = processor.batch_decode(ort_outputs, skip_special_tokens=True)[0] - diff = pt_outputs - ort_outputs - max_diff = max(diff.min(), diff.max(), key=abs) + parity = ( + pt_transcription in expected_transcription_options and ort_transcription in expected_transcription_options + ) + max_diff = 0 - if max_diff > 0: - # For ONNX Runtime INT8 model - pt_expected_transcription = ( - " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel." - ) - pt_transcription = processor.batch_decode(pt_outputs, skip_special_tokens=True) - ort_expected_transcription = ( - " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel." 
- ) - ort_transcription = processor.batch_decode(ort_outputs, skip_special_tokens=True) + if not parity: + if pt_outputs.shape != ort_outputs.shape: + diff = pt_outputs - ort_outputs[:, : len(pt_outputs[0])] + else: + diff = pt_outputs - ort_outputs + max_diff = max(diff.min(), diff.max(), key=abs) - parity = ( - pt_expected_transcription == pt_transcription[0] and ort_expected_transcription == ort_transcription[0] - ) - if parity: - max_diff = 0 + if max_diff != 0: + logger.warning(f"PyTorch outputs: {pt_transcription}") + logger.warning(f"ONNX Runtime outputs: {ort_transcription}") return max_diff diff --git a/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py b/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py index f3e67930adbff..66f24c47f6cdb 100644 --- a/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py +++ b/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py @@ -4,6 +4,7 @@ # -------------------------------------------------------------------------- import torch +from torch._C._onnx import OperatorExportTypes TrainingMode = torch.onnx.TrainingMode from packaging.version import Version # noqa: E402 @@ -18,7 +19,7 @@ def torch_onnx_export( training=TrainingMode.EVAL, input_names=None, output_names=None, - operator_export_type=None, + operator_export_type=OperatorExportTypes.ONNX, opset_version=None, _retain_param_name=None, do_constant_folding=True, diff --git a/onnxruntime/test/python/transformers/test_generation.py b/onnxruntime/test/python/transformers/test_generation.py index 40ea8cf774918..33ec1bd7728fe 100644 --- a/onnxruntime/test/python/transformers/test_generation.py +++ b/onnxruntime/test/python/transformers/test_generation.py @@ -381,22 +381,23 @@ def test_logits_processor(self): @pytest.mark.slow def test_cross_qk_overall(self): - decoder_input_ids = [ - "--chain_model", - "--collect_cross_qk", - "--output_cross_qk", - "--use_forced_decoder_ids", - "--extra_decoding_ids", - "--output_no_speech_probs", + cross_qk_input_args = [ "--use_vocab_mask", "--use_prefix_vocab_mask", + "--use_forced_decoder_ids", "--use_logits_processor", + "--collect_cross_qk", + "--extra_decoding_ids", ] - self.run_configs(decoder_input_ids) + cross_qk_output_args = [ + "--output_cross_qk", + "--output_no_speech_probs", + ] + self.run_configs(cross_qk_input_args + cross_qk_output_args) @pytest.mark.slow def test_openai_impl_whisper(self): - optional_args = ["--model_impl", "openai", "--chain_model", "--use_whisper_beamsearch"] + optional_args = ["--model_impl", "openai"] self.run_configs(optional_args) diff --git a/onnxruntime/test/python/transformers/test_whisper_timestamp_processor.py b/onnxruntime/test/python/transformers/test_whisper_timestamp_processor.py index 77ce09d7e793b..7892000ae45a0 100644 --- a/onnxruntime/test/python/transformers/test_whisper_timestamp_processor.py +++ b/onnxruntime/test/python/transformers/test_whisper_timestamp_processor.py @@ -50,7 +50,7 @@ def run_timestamp(self, provider: str): ort_out = sess.run(None, ort_inputs) ort_out_tensor = torch.from_numpy(ort_out[0]) ort_transcription = processor.batch_decode( - ort_out_tensor[0][0].view(1, -1), skip_special_tokens=True, output_offsets=True + ort_out_tensor[0][0].view(1, -1), skip_special_tokens=True, output_offsets=True, decode_with_timestamps=True ) print(ort_transcription) expected_transcription = [ @@ -58,7 +58,7 @@ def run_timestamp(self, provider: str): "text": "<|0.00|> Mr. 
Quilter is the apostle of the middle classes and we are glad to welcome his gospel.<|5.44|>", "offsets": [ { - "text": "<|0.00|> Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.<|5.44|>", + "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", "timestamp": (0.0, 5.44), } ], From 4874a41008138ecc1f26e9cd17e5d9d7febb29aa Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Fri, 16 Feb 2024 16:59:43 -0800 Subject: [PATCH 008/279] [QNN EP] Update default QNN SDK to 2.19.2.240210 (#19546) ### Description Updates the default QNN SDK version to 2.19.2.240210. ### Motivation and Context Build and test the latest version of QNN SDK in our pipelines. --- .../android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml | 2 +- tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml | 2 +- .../github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml | 2 +- .../github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml | 2 +- tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml index 2b181810b0788..d37266a8e96d8 100644 --- a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml @@ -31,7 +31,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: qnn-v2.18.0.240101 + default: qnn-v2.19.2.240210 jobs: - job: Build_QNN_EP diff --git a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml index 0312b70d2b1d5..8fa5bdbf90931 100644 --- a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml @@ -32,7 +32,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: qnn-v2.18.0.240101 + default: qnn-v2.19.2.240210 jobs: - job: Build_QNN_EP diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml index b0509467e1689..9a38513d04a79 100644 --- a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml @@ -2,7 +2,7 @@ parameters: - name: QnnSdk displayName: QNN SDK Version type: string - default: qnn-v2.18.0.240101_win + default: qnn-v2.19.2.240210_win - name: build_config displayName: Build Configuration diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml index 13d4589a67cdc..dc861f7f1ed79 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml @@ -32,7 +32,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: qnn-v2.18.0.240101_win + default: qnn-v2.19.2.240210_win jobs: - job: 'build' diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml index 6246bb83566e5..534d5c6d6135b 100644 --- 
a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml @@ -32,7 +32,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: qnn-v2.18.0.240101_win + default: qnn-v2.19.2.240210_win jobs: - job: 'build' From 06269a3952fb1759d93235b9d66f9beb10ae8663 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 16 Feb 2024 18:28:27 -0800 Subject: [PATCH 009/279] [js/webgpu] allow uint8 tensors for webgpu (#19545) ### Description allow uint8 tensors for webgpu --- js/common/lib/tensor-impl.ts | 2 +- js/common/lib/tensor.ts | 2 +- js/web/lib/wasm/wasm-common.ts | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/js/common/lib/tensor-impl.ts b/js/common/lib/tensor-impl.ts index e3e2b9c728556..de18126a9d0ae 100644 --- a/js/common/lib/tensor-impl.ts +++ b/js/common/lib/tensor-impl.ts @@ -103,7 +103,7 @@ export class Tensor implements TensorInterface { } case 'gpu-buffer': { if ((type !== 'float32' && type !== 'float16' && type !== 'int32' && type !== 'int64' && type !== 'uint32' && - type !== 'bool')) { + type !== 'uint8' && type !== 'bool')) { throw new TypeError(`unsupported type "${type}" to create tensor from gpu buffer`); } this.gpuBufferData = arg0.gpuBuffer; diff --git a/js/common/lib/tensor.ts b/js/common/lib/tensor.ts index 6c08d1fe8e057..d5da33640dc7d 100644 --- a/js/common/lib/tensor.ts +++ b/js/common/lib/tensor.ts @@ -135,7 +135,7 @@ export declare namespace Tensor { /** * supported data types for constructing a tensor from a WebGPU buffer */ - export type GpuBufferDataTypes = 'float32'|'float16'|'int32'|'int64'|'uint32'|'bool'; + export type GpuBufferDataTypes = 'float32'|'float16'|'int32'|'int64'|'uint32'|'uint8'|'bool'; /** * represent where the tensor data is stored diff --git a/js/web/lib/wasm/wasm-common.ts b/js/web/lib/wasm/wasm-common.ts index b9eff45e890c4..93910af1f1bf0 100644 --- a/js/web/lib/wasm/wasm-common.ts +++ b/js/web/lib/wasm/wasm-common.ts @@ -169,7 +169,8 @@ export const logLevelStringToEnum = (logLevel?: 'verbose'|'info'|'warning'|'erro * Check whether the given tensor type is supported by GPU buffer */ export const isGpuBufferSupportedType = (type: Tensor.Type): type is Tensor.GpuBufferDataTypes => type === 'float32' || - type === 'int32' || type === 'int64' || type === 'bool' || type === 'float16' || type === 'uint32'; + type === 'float16' || type === 'int32' || type === 'int64' || type === 'uint32' || type === 'uint8' || + type === 'bool'; /** * Map string data location to integer value From dfeda9019cfed2d6df5bcacc54269c7de481bdee Mon Sep 17 00:00:00 2001 From: satyajandhyala Date: Sat, 17 Feb 2024 09:19:17 -0800 Subject: [PATCH 010/279] [JS/WebGPU] Add MatMulNBits (#19446) ### Description Add MatMulNBits to support MatMul using 4-bit quantized weights ### Motivation and Context --- js/web/docs/webgpu-operators.md | 1 + js/web/lib/wasm/jsep/util.ts | 28 + .../lib/wasm/jsep/webgpu/op-resolve-rules.ts | 2 + .../lib/wasm/jsep/webgpu/ops/matmulnbits.ts | 184 ++ js/web/test/data/ops/matmulnbits.jsonc | 1527 +++++++++++++++++ js/web/test/suite-test-list.jsonc | 1 + .../contrib_ops/js/js_contrib_kernels.cc | 16 +- .../js/quantization/matmul_nbits.cc | 25 + .../js/quantization/matmul_nbits.h | 48 + 9 files changed, 1825 insertions(+), 7 deletions(-) create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/matmulnbits.ts create mode 100644 js/web/test/data/ops/matmulnbits.jsonc create mode 100644 
onnxruntime/contrib_ops/js/quantization/matmul_nbits.cc create mode 100644 onnxruntime/contrib_ops/js/quantization/matmul_nbits.h diff --git a/js/web/docs/webgpu-operators.md b/js/web/docs/webgpu-operators.md index b21af8e715db3..4a8c92bb97bfd 100644 --- a/js/web/docs/webgpu-operators.md +++ b/js/web/docs/webgpu-operators.md @@ -62,6 +62,7 @@ Do not modify directly.* | LessOrEqual | ai.onnx(12-15,16+) | | | Log | ai.onnx(6-12,13+) | | | MatMul | ai.onnx(1-12,13+) | | +| MatMulNBits | com.microsoft(1+) | | | MaxPool | ai.onnx(1-7,8-9,10,11,12+); com.ms.internal.nhwc(1-7,8-9,10,11,12+) | need perf optimization; need implementing activation | | MemcpyFromHost | ai.onnx(1+) | | | MemcpyToHost | ai.onnx(1+) | | diff --git a/js/web/lib/wasm/jsep/util.ts b/js/web/lib/wasm/jsep/util.ts index 6922d7ff5df6e..c0517ce363644 100644 --- a/js/web/lib/wasm/jsep/util.ts +++ b/js/web/lib/wasm/jsep/util.ts @@ -92,6 +92,34 @@ export class ShapeUtil { return ShapeUtil.getSizeFromDimensionRange(dims, 0, dims.length); } + /** + * convert dims corresponding to type change to pack. ex. uint8 data to uint32 + */ + static convertShape(dims: readonly number[], size = 4): readonly number[] { + const rank = dims.length; + if (rank === 0) { + return []; + } + const newDims = new Array(rank); + let i = rank - 1; + while (i >= 0) { + if (dims[i] % size === 0) { + newDims[i] = dims[i] / size; + break; + } + if (size % dims[i] !== 0) { + throw new Error('cannot convert shape'); + } + newDims[i] = 1; + size /= dims[i]; + i--; + } + for (i--; i >= 0; i--) { + newDims[i] = dims[i]; + } + return newDims; + } + /** * calculate the size (number of elements) from the given axis (inclusive) */ diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index ac08c5fb1f7ab..ba874c8dd0f80 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -20,6 +20,7 @@ import {gemm, parseGemmAttributes} from './ops/gemm'; import {instanceNorm} from './ops/instance-norm'; import {layerNorm} from './ops/layer-norm'; import {matMul} from './ops/matmul'; +import {matMulNBits, parseMatMulNBitsAttributes} from './ops/matmulnbits'; import {multiHeadAttention, parseMultiHeadAttentionAttributes} from './ops/multi-head-attentiion'; import {pad} from './ops/pad'; import * as pool from './ops/pool'; @@ -92,6 +93,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['LessOrEqual', [binaryOps.lessOrEqual]], ['Log', [unaryOps.log]], ['MatMul', [matMul]], + ['MatMulNBits', [matMulNBits, parseMatMulNBitsAttributes]], // TODO: support new attributes for MaxPool-8 and MaxPool-10 ['MaxPool', [pool.maxPool, pool.parseMaxPoolAttributes]], ['Mul', [binaryOps.mul]], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/matmulnbits.ts b/js/web/lib/wasm/jsep/webgpu/ops/matmulnbits.ts new file mode 100644 index 0000000000000..ead7635cf3ac4 --- /dev/null +++ b/js/web/lib/wasm/jsep/webgpu/ops/matmulnbits.ts @@ -0,0 +1,184 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +import {DataType} from '../../../wasm-common'; +import {TensorView} from '../../tensor-view'; +import {ShapeUtil} from '../../util'; +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; +import {ComputeContext, ProgramInfo, ProgramUniform} from '../types'; + +import {createTensorShapeVariables, inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType, UniformsArrayType} from './common'; + +// TODO support quantization bits not equal to 4 +export interface MatMulNBitsAttributes extends AttributeWithCacheKey { + k: number; + n: number; + accuracyLevel: number; + bits: number; + blockSize: number; +} + +const validateInputs = (inputs: readonly TensorView[], attributes: MatMulNBitsAttributes): void => { + if (inputs.length < 3 || inputs.length > 4) { + throw new Error('MatMulNBits requires 3 or 4 inputs'); + } + const a = inputs[0]; + const aRank = a.dims.length; + if (a.dims[aRank - 1] !== attributes.k) { + throw new Error('The last dim of input shape does not match the k value'); + } + const nBlocksPerCol = Math.floor((attributes.k + attributes.blockSize - 1) / attributes.blockSize); + const blobSize = attributes.blockSize / 8 * attributes.bits; + const b = inputs[1]; + if (!ShapeUtil.areEqual(b.dims, [attributes.n, nBlocksPerCol, blobSize])) { + throw new Error('The second inputs must be 3D tensor with shape N X nBlocksPerCol X blobSize'); + } + const scales = inputs[2]; + const scalesShape = scales.dims; + if (ShapeUtil.size(scalesShape) !== attributes.n * nBlocksPerCol) { + throw new Error('scales input size error.'); + } + if (inputs.length === 4) { + const zeroPoints = inputs[3]; + const zeroPointsShape = zeroPoints.dims; + const expectedZeroPointsSize = + attributes.bits > 4 ? (attributes.n * nBlocksPerCol) : attributes.n * Math.floor((nBlocksPerCol + 1) / 2); + if (ShapeUtil.size(zeroPointsShape) !== expectedZeroPointsSize) { + throw new Error('zeroPoints input size error.'); + } + } +}; + +export const createMatMulNBitsProgramInfo = + (inputs: readonly TensorView[], attributes: MatMulNBitsAttributes): ProgramInfo => { + const a = inputs[0]; + const b = inputs[1]; + const scales = inputs[2]; + const aRank = a.dims.length; + const outputShape = a.dims.slice(0, aRank - 1).concat(attributes.n); + const outputSize = ShapeUtil.size(outputShape); + + + const programUniforms: ProgramUniform[] = [ + {type: DataType.uint32, data: outputSize}, {type: DataType.uint32, data: attributes.k}, + {type: DataType.uint32, data: attributes.n}, {type: DataType.uint32, data: attributes.accuracyLevel}, + {type: DataType.uint32, data: attributes.bits}, {type: DataType.uint32, data: attributes.blockSize} + ]; + programUniforms.push(...createTensorShapeVariables(a.dims)); + programUniforms.push(...createTensorShapeVariables(ShapeUtil.convertShape(b.dims))); + programUniforms.push(...createTensorShapeVariables(scales.dims)); + if (inputs.length === 4) { + programUniforms.push(...createTensorShapeVariables(ShapeUtil.convertShape(inputs[3].dims))); + } + programUniforms.push(...createTensorShapeVariables(outputShape)); + const getShaderSource = (shaderHelper: ShaderHelper) => { + const a = inputVariable('a', inputs[0].dataType, inputs[0].dims.length); + const b = inputVariable('b', DataType.uint32, inputs[1].dims.length); + const scales = inputVariable('scales', inputs[2].dataType, inputs[2].dims.length); + const inputVariables = [a, b, scales]; + const zeroPoints = + inputs.length === 4 ? 
inputVariable('zero_points', DataType.uint32, inputs[3].dims.length) : undefined; + if (zeroPoints) { + inputVariables.push(zeroPoints); + } + const output = outputVariable('output', inputs[0].dataType, outputShape.length); + const uniforms: UniformsArrayType = [ + {name: 'output_size', type: 'u32'}, {name: 'k', type: 'u32'}, {name: 'n', type: 'u32'}, + {name: 'accuracy_level', type: 'u32'}, {name: 'bits', type: 'u32'}, {name: 'block_size', type: 'u32'} + ]; + const nBlocksPerCol = Math.floor((attributes.k + attributes.blockSize - 1) / attributes.blockSize); + const blobSize = attributes.blockSize / 8 * attributes.bits; + const wordPerBlob = blobSize / 4; + const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); + return ` + fn ortUnpack8x4snorm(value: u32) -> array<${dataType}, 8>{ + var result = array<${dataType}, 8>(); + var offset: u32 = 0; + let count: u32 = 4; + for (var i: u32 = 0; i < 8u; i++) { + result[i] = ${dataType}(extractBits(value, offset, count)); + offset += count; + } + return result; + } + ${shaderHelper.registerUniforms(uniforms).declareVariables(...inputVariables, output)} + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} + var value: ${dataType} = 0.0; + let output_indices = ${output.offsetToIndices('global_idx')}; + var a_indices: ${a.type.indices} = output_indices; + var n = ${output.indicesGet('output_indices', aRank - 1)}; + // Two zero points are packed into one byte because uniforms.bits <= 4. + // zero_point_offset is either 0 or 4. It is bit offset within one byte. + // TODO support zero_point_offset for bits > 4 + ${ + zeroPoints ? ` + var zero_point_index: u32 = n * ((${nBlocksPerCol} + 1) / 2) / 4; + var zero_point_word: u32 = ${zeroPoints.getByOffset('zero_point_index')}; + var zero_point_offset: u32 = 0;` : + ''} + var scale_idex = n * ${nBlocksPerCol}; + var b_indices: ${b.type.indices}; + ${b.indicesSet('b_indices', '0', 'n')}; + var block_offset: u32 = 0; + for (var block: u32 = 0; block < ${nBlocksPerCol}; block++) { + // The scale and zero points are computed per block. + let scale = ${scales.getByOffset('scale_idex')}; + // The default zero point is 8 for unsigned 4-bit quantization. + let zero_point: ${dataType} = ${ + zeroPoints ? `${dataType}(extractBits(zero_point_word, zero_point_offset, 4))` : 8.0}; + ${b.indicesSet('b_indices', '1', 'block')}; + var word_offset: u32 = block_offset; + for (var word: u32 = 0; word < ${wordPerBlob}; word++) { + ${b.indicesSet('b_indices', '2', 'word')}; + let b_value = ${b.getByIndices('b_indices')}; + let b_quantized_values: array<${dataType}, 8> = ortUnpack8x4snorm(b_value); + // Number of B elements per 32-bit word is 32/bits = 32/4 = 8 + var offset: u32 = word_offset; + for (var i: u32 = 0; i < 8; i++) { + ${a.indicesSet('a_indices', aRank - 1, 'offset')}; + let a_value = ${a.getByIndices('a_indices')}; + let b_quantized_value = b_quantized_values[i]; + let b_dequantized_value = (b_quantized_value - zero_point) * scale; + value += a_value * b_dequantized_value; + offset++; + } + word_offset += 8; + } + scale_idex++; + ${ + zeroPoints ? 
` + if (zero_point_offset == 28) { + zero_point_offset = 0; + zero_point_index++; + zero_point_word = ${zeroPoints.getByOffset('zero_point_index')}; + } else { + zero_point_offset += 4; + }` : + ''} + block_offset += uniforms.block_size; + } + ${output.setByOffset('global_idx', 'value')}; + } + `; + }; + return { + name: 'MatMulNBits', + shaderCache: + {hint: `${attributes.cacheKey};${inputs.length}`, inputDependencies: Array(inputs.length).fill('rank')}, + getRunData: () => ({ + outputs: [{dims: outputShape, dataType: inputs[0].dataType}], + dispatchGroup: {x: Math.ceil(outputSize / 64)}, + programUniforms + }), + getShaderSource + }; + }; + +export const matMulNBits = (context: ComputeContext, attributes: MatMulNBitsAttributes): void => { + validateInputs(context.inputs, attributes); + context.compute(createMatMulNBitsProgramInfo(context.inputs, attributes)); +}; + +export const parseMatMulNBitsAttributes = (attributes: Record): MatMulNBitsAttributes => + createAttributeWithCacheKey(attributes as Omit); diff --git a/js/web/test/data/ops/matmulnbits.jsonc b/js/web/test/data/ops/matmulnbits.jsonc new file mode 100644 index 0000000000000..c57c431afb3ce --- /dev/null +++ b/js/web/test/data/ops/matmulnbits.jsonc @@ -0,0 +1,1527 @@ +[ + { + "name": "MatMulNBits; K=16, N=16, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 16, "type": "int" }, + { "name": "N", "data": 16, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=16, N=16, block_size=16, bits=4; symmetric", + "inputs": [ + { + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, + 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, + 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, + 253, 254, 255 + ], + "dims": [16, 16], + "type": "float32" + }, + { + "dims": [16, 1, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 
124, 125, 126, 127, + 128 + ] + }, + { + "dims": [16], + "type": "float32", + "data": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + } + ], + "outputs": [ + { + "dims": [16, 16], + "type": "float32", + "data": [ + 0, -385, -1120, -963, -1984, -1285, -2592, -1351, -2944, -1161, -3040, -715, -2880, -13, -2464, 945, 0, + -1073, -3808, -2643, -6848, -3445, -9120, -3479, -10624, -2745, -11360, -1243, -11328, 1027, -10528, 4065, + 0, -1761, -6496, -4323, -11712, -5605, -15648, -5607, -18304, -4329, -19680, -1771, -19776, 2067, -18592, + 7185, 0, -2449, -9184, -6003, -16576, -7765, -22176, -7735, -25984, -5913, -28000, -2299, -28224, 3107, + -26656, 10305, 0, -3137, -11872, -7683, -21440, -9925, -28704, -9863, -33664, -7497, -36320, -2827, + -36672, 4147, -34720, 13425, 0, -3825, -14560, -9363, -26304, -12085, -35232, -11991, -41344, -9081, + -44640, -3355, -45120, 5187, -42784, 16545, 0, -4513, -17248, -11043, -31168, -14245, -41760, -14119, + -49024, -10665, -52960, -3883, -53568, 6227, -50848, 19665, 0, -5201, -19936, -12723, -36032, -16405, + -48288, -16247, -56704, -12249, -61280, -4411, -62016, 7267, -58912, 22785, 0, -5889, -22624, -14403, + -40896, -18565, -54816, -18375, -64384, -13833, -69600, -4939, -70464, 8307, -66976, 25905, 0, -6577, + -25312, -16083, -45760, -20725, -61344, -20503, -72064, -15417, -77920, -5467, -78912, 9347, -75040, + 29025, 0, -7265, -28000, -17763, -50624, -22885, -67872, -22631, -79744, -17001, -86240, -5995, -87360, + 10387, -83104, 32145, 0, -7953, -30688, -19443, -55488, -25045, -74400, -24759, -87424, -18585, -94560, + -6523, -95808, 11427, -91168, 35265, 0, -8641, -33376, -21123, -60352, -27205, -80928, -26887, -95104, + -20169, -102880, -7051, -104256, 12467, -99232, 38385, 0, -9329, -36064, -22803, -65216, -29365, -87456, + -29015, -102784, -21753, -111200, -7579, -112704, 13507, -107296, 41505, 0, -10017, -38752, -24483, + -70080, -31525, -93984, -31143, -110464, -23337, -119520, -8107, -121152, 14547, -115360, 44625, 0, + -10705, -41440, -26163, -74944, -33685, -100512, -33271, -118144, -24921, -127840, -8635, -129600, 15587, + -123424, 47745 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=16, N=16, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 16, "type": "int" }, + { "name": "N", "data": 16, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=16, N=16, block_size=16, bits=4; asymmetric", + "inputs": [ + { + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, + 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 
201, 202, 203, 204, 205, 206, 207, 208, 209, 210, + 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, + 253, 254, 255 + ], + "dims": [16, 16], + "type": "float32" + }, + { + "dims": [16, 1, 8], + "type": "uint8", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127 + ] + }, + { + "dims": [16], + "type": "float32", + "data": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + }, + { + "dims": [16], + "type": "uint8", + "data": [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128] + } + ], + "outputs": [ + { + "dims": [16, 16], + "type": "float32", + "data": [ + 0, 728, 688, 2376, 1632, 4280, 2832, 6440, 4288, 8856, 6000, 11528, 7968, 14456, 10192, 17640, 0, 2200, + 1840, 7176, 4448, 12920, 7824, 19432, 11968, 26712, 16880, 34760, 22560, 43576, 29008, 53160, 0, 3672, + 2992, 11976, 7264, 21560, 12816, 32424, 19648, 44568, 27760, 57992, 37152, 72696, 47824, 88680, 0, 5144, + 4144, 16776, 10080, 30200, 17808, 45416, 27328, 62424, 38640, 81224, 51744, 101816, 66640, 124200, 0, + 6616, 5296, 21576, 12896, 38840, 22800, 58408, 35008, 80280, 49520, 104456, 66336, 130936, 85456, 159720, + 0, 8088, 6448, 26376, 15712, 47480, 27792, 71400, 42688, 98136, 60400, 127688, 80928, 160056, 104272, + 195240, 0, 9560, 7600, 31176, 18528, 56120, 32784, 84392, 50368, 115992, 71280, 150920, 95520, 189176, + 123088, 230760, 0, 11032, 8752, 35976, 21344, 64760, 37776, 97384, 58048, 133848, 82160, 174152, 110112, + 218296, 141904, 266280, 0, 12504, 9904, 40776, 24160, 73400, 42768, 110376, 65728, 151704, 93040, 197384, + 124704, 247416, 160720, 301800, 0, 13976, 11056, 45576, 26976, 82040, 47760, 123368, 73408, 169560, + 103920, 220616, 139296, 276536, 179536, 337320, 0, 15448, 12208, 50376, 29792, 90680, 52752, 136360, + 81088, 187416, 114800, 243848, 153888, 305656, 198352, 372840, 0, 16920, 13360, 55176, 32608, 99320, + 57744, 149352, 88768, 205272, 125680, 267080, 168480, 334776, 217168, 408360, 0, 18392, 14512, 59976, + 35424, 107960, 62736, 162344, 96448, 223128, 136560, 290312, 183072, 363896, 235984, 443880, 0, 19864, + 15664, 64776, 38240, 116600, 67728, 175336, 104128, 240984, 147440, 313544, 197664, 393016, 254800, + 479400, 0, 21336, 16816, 69576, 41056, 125240, 72720, 188328, 111808, 258840, 158320, 336776, 212256, + 422136, 273616, 514920, 0, 22808, 17968, 74376, 43872, 133880, 77712, 201320, 119488, 276696, 169200, + 360008, 226848, 451256, 292432, 550440 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=16, N=32, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 16, "type": "int" }, + { "name": "N", "data": 32, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=16, N=32, block_size=16, bits=4; 
symmetric", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512 + ], + "dims": [32, 16], + "type": "float32" + }, + { + "dims": [32, 1, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 29, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 
200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256 + ] + }, + { + "dims": [32], + "type": "float32", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31 + ] + } + ], + "outputs": [ + { + "dims": [32, 32], + "type": "float32", + "data": [ + 0, -428, -1288, -1068, -2288, -1420, -3000, -1484, -3424, -1260, -3560, -748, -3408, 52, -2968, 1140, + -2272, 2516, -1224, 4180, 80, 6132, 1672, 8372, 3552, 10900, 5720, 13716, 8176, 16820, 10920, 12276, 0, + -1116, -3976, -2748, -7152, -3580, -9528, -3612, -11104, -2844, -11880, -1276, -11856, 1092, -11032, 4260, + -8160, 8228, -6984, 12996, -3760, 18564, 264, 24932, 5088, 32100, 10712, 40068, 17136, 48836, 24360, + 42532, 0, -1804, -6664, -4428, -12016, -5740, -16056, -5740, -18784, -4428, -20200, -1804, -20304, 2132, + -19096, 7380, -14048, 13940, -12744, 21812, -7600, 30996, -1144, 41492, 6624, 53300, 15704, 66420, 26096, + 80852, 37800, 72788, 0, -2492, -9352, -6108, -16880, -7900, -22584, -7868, -26464, -6012, -28520, -2332, + -28752, 3172, -27160, 10500, -19936, 19652, -18504, 30628, -11440, 43428, -2552, 58052, 8160, 74500, + 20696, 92772, 35056, 112868, 51240, 103044, 0, -3180, -12040, -7788, -21744, -10060, -29112, -9996, + -34144, -7596, -36840, -2860, -37200, 4212, -35224, 13620, -25824, 25364, -24264, 39444, -15280, 55860, + -3960, 74612, 9696, 95700, 25688, 119124, 44016, 144884, 64680, 133300, 0, -3868, -14728, -9468, -26608, + -12220, -35640, -12124, -41824, -9180, -45160, -3388, -45648, 5252, -43288, 16740, -31712, 31076, -30024, + 48260, -19120, 68292, -5368, 91172, 11232, 116900, 30680, 145476, 52976, 176900, 78120, 163556, 0, -4556, + -17416, -11148, -31472, -14380, -42168, -14252, -49504, -10764, -53480, -3916, -54096, 6292, -51352, + 19860, -37600, 36788, -35784, 57076, -22960, 80724, -6776, 107732, 12768, 138100, 35672, 171828, 61936, + 208916, 91560, 193812, 0, -5244, -20104, -12828, -36336, -16540, -48696, -16380, -57184, -12348, -61800, + -4444, -62544, 7332, -59416, 22980, -43488, 42500, -41544, 65892, -26800, 93156, -8184, 124292, 14304, + 159300, 40664, 198180, 70896, 240932, 105000, 224068, 0, -5932, -22792, -14508, -41200, -18700, -55224, + -18508, -64864, -13932, -70120, -4972, -70992, 8372, -67480, 26100, -49376, 48212, -47304, 74708, -30640, + 105588, -9592, 140852, 15840, 180500, 45656, 224532, 79856, 272948, 118440, 254324, 0, -6620, -25480, + -16188, -46064, -20860, -61752, -20636, -72544, -15516, -78440, -5500, -79440, 9412, -75544, 29220, + -55264, 53924, -53064, 83524, -34480, 118020, -11000, 157412, 17376, 201700, 50648, 250884, 88816, 304964, + 131880, 284580, 0, -7308, -28168, -17868, -50928, -23020, -68280, -22764, -80224, -17100, -86760, -6028, + -87888, 10452, -83608, 32340, -61152, 59636, -58824, 92340, -38320, 130452, -12408, 173972, 18912, 222900, + 55640, 277236, 97776, 336980, 145320, 314836, 0, -7996, -30856, -19548, -55792, -25180, -74808, -24892, + -87904, -18684, -95080, -6556, -96336, 11492, -91672, 35460, -67040, 65348, -64584, 101156, -42160, + 142884, -13816, 190532, 20448, 244100, 60632, 303588, 106736, 368996, 158760, 345092, 0, -8684, -33544, + -21228, -60656, -27340, -81336, -27020, -95584, -20268, -103400, -7084, -104784, 12532, -99736, 38580, + -72928, 71060, -70344, 109972, 
-46000, 155316, -15224, 207092, 21984, 265300, 65624, 329940, 115696, + 401012, 172200, 375348, 0, -9372, -36232, -22908, -65520, -29500, -87864, -29148, -103264, -21852, + -111720, -7612, -113232, 13572, -107800, 41700, -78816, 76772, -76104, 118788, -49840, 167748, -16632, + 223652, 23520, 286500, 70616, 356292, 124656, 433028, 185640, 405604, 0, -10060, -38920, -24588, -70384, + -31660, -94392, -31276, -110944, -23436, -120040, -8140, -121680, 14612, -115864, 44820, -84704, 82484, + -81864, 127604, -53680, 180180, -18040, 240212, 25056, 307700, 75608, 382644, 133616, 465044, 199080, + 435860, 0, -10748, -41608, -26268, -75248, -33820, -100920, -33404, -118624, -25020, -128360, -8668, + -130128, 15652, -123928, 47940, -90592, 88196, -87624, 136420, -57520, 192612, -19448, 256772, 26592, + 328900, 80600, 408996, 142576, 497060, 212520, 466116, 0, -11436, -44296, -27948, -80112, -35980, -107448, + -35532, -126304, -26604, -136680, -9196, -138576, 16692, -131992, 51060, -96480, 93908, -93384, 145236, + -61360, 205044, -20856, 273332, 28128, 350100, 85592, 435348, 151536, 529076, 225960, 496372, 0, -12124, + -46984, -29628, -84976, -38140, -113976, -37660, -133984, -28188, -145000, -9724, -147024, 17732, -140056, + 54180, -102368, 99620, -99144, 154052, -65200, 217476, -22264, 289892, 29664, 371300, 90584, 461700, + 160496, 561092, 239400, 526628, 0, -12812, -49672, -31308, -89840, -40300, -120504, -39788, -141664, + -29772, -153320, -10252, -155472, 18772, -148120, 57300, -108256, 105332, -104904, 162868, -69040, 229908, + -23672, 306452, 31200, 392500, 95576, 488052, 169456, 593108, 252840, 556884, 0, -13500, -52360, -32988, + -94704, -42460, -127032, -41916, -149344, -31356, -161640, -10780, -163920, 19812, -156184, 60420, + -114144, 111044, -110664, 171684, -72880, 242340, -25080, 323012, 32736, 413700, 100568, 514404, 178416, + 625124, 266280, 587140, 0, -14188, -55048, -34668, -99568, -44620, -133560, -44044, -157024, -32940, + -169960, -11308, -172368, 20852, -164248, 63540, -120032, 116756, -116424, 180500, -76720, 254772, -26488, + 339572, 34272, 434900, 105560, 540756, 187376, 657140, 279720, 617396, 0, -14876, -57736, -36348, -104432, + -46780, -140088, -46172, -164704, -34524, -178280, -11836, -180816, 21892, -172312, 66660, -125920, + 122468, -122184, 189316, -80560, 267204, -27896, 356132, 35808, 456100, 110552, 567108, 196336, 689156, + 293160, 647652, 0, -15564, -60424, -38028, -109296, -48940, -146616, -48300, -172384, -36108, -186600, + -12364, -189264, 22932, -180376, 69780, -131808, 128180, -127944, 198132, -84400, 279636, -29304, 372692, + 37344, 477300, 115544, 593460, 205296, 721172, 306600, 677908, 0, -16252, -63112, -39708, -114160, -51100, + -153144, -50428, -180064, -37692, -194920, -12892, -197712, 23972, -188440, 72900, -137696, 133892, + -133704, 206948, -88240, 292068, -30712, 389252, 38880, 498500, 120536, 619812, 214256, 753188, 320040, + 708164, 0, -16940, -65800, -41388, -119024, -53260, -159672, -52556, -187744, -39276, -203240, -13420, + -206160, 25012, -196504, 76020, -143584, 139604, -139464, 215764, -92080, 304500, -32120, 405812, 40416, + 519700, 125528, 646164, 223216, 785204, 333480, 738420, 0, -17628, -68488, -43068, -123888, -55420, + -166200, -54684, -195424, -40860, -211560, -13948, -214608, 26052, -204568, 79140, -149472, 145316, + -145224, 224580, -95920, 316932, -33528, 422372, 41952, 540900, 130520, 672516, 232176, 817220, 346920, + 768676, 0, -18316, -71176, -44748, -128752, -57580, -172728, -56812, -203104, -42444, -219880, -14476, + 
-223056, 27092, -212632, 82260, -155360, 151028, -150984, 233396, -99760, 329364, -34936, 438932, 43488, + 562100, 135512, 698868, 241136, 849236, 360360, 798932, 0, -19004, -73864, -46428, -133616, -59740, + -179256, -58940, -210784, -44028, -228200, -15004, -231504, 28132, -220696, 85380, -161248, 156740, + -156744, 242212, -103600, 341796, -36344, 455492, 45024, 583300, 140504, 725220, 250096, 881252, 373800, + 829188, 0, -19692, -76552, -48108, -138480, -61900, -185784, -61068, -218464, -45612, -236520, -15532, + -239952, 29172, -228760, 88500, -167136, 162452, -162504, 251028, -107440, 354228, -37752, 472052, 46560, + 604500, 145496, 751572, 259056, 913268, 387240, 859444, 0, -20380, -79240, -49788, -143344, -64060, + -192312, -63196, -226144, -47196, -244840, -16060, -248400, 30212, -236824, 91620, -173024, 168164, + -168264, 259844, -111280, 366660, -39160, 488612, 48096, 625700, 150488, 777924, 268016, 945284, 400680, + 889700, 0, -21068, -81928, -51468, -148208, -66220, -198840, -65324, -233824, -48780, -253160, -16588, + -256848, 31252, -244888, 94740, -178912, 173876, -174024, 268660, -115120, 379092, -40568, 505172, 49632, + 646900, 155480, 804276, 276976, 977300, 414120, 919956, 0, -21756, -84616, -53148, -153072, -68380, + -205368, -67452, -241504, -50364, -261480, -17116, -265296, 32292, -252952, 97860, -184800, 179588, + -179784, 277476, -118960, 391524, -41976, 521732, 51168, 668100, 160472, 830628, 285936, 1009316, 427560, + 950212 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=16, N=32, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 16, "type": "int" }, + { "name": "N", "data": 32, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=16, N=32, block_size=16, bits=4; asymmetric", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 
330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512 + ], + "dims": [32, 16], + "type": "float32" + }, + { + "dims": [32, 1, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 29, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256 + ] + }, + { + "dims": [32], + "type": "float32", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31 + ] + }, + { + "dims": [32], + "type": "uint8", + "data": [ + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 + ] + } + ], + "outputs": [ + { + "dims": [32, 32], + "type": "float32", + "data": [ + 0, 660, 888, 2196, 2064, 4020, 3528, 6132, 5280, 8532, 7320, 11220, 9648, 14196, 12264, 17460, 15136, + 21012, 18360, 24852, 21840, 28980, 25608, 33396, 29664, 38100, 34008, 43092, 38640, 48372, 43560, 46004, + 0, 2020, 2296, 6660, 5392, 12100, 9288, 18340, 13984, 25380, 19480, 33220, 25776, 41860, 32872, 51300, + 42016, 61540, 49464, 72580, 58960, 84420, 69256, 97060, 80352, 110500, 92248, 124740, 104944, 139780, + 118440, 139748, 0, 3380, 3704, 11124, 8720, 20180, 15048, 30548, 22688, 42228, 31640, 55220, 41904, 69524, + 53480, 85140, 68896, 102068, 80568, 120308, 96080, 139860, 112904, 160724, 131040, 182900, 150488, 206388, + 171248, 231188, 193320, 233492, 0, 4740, 5112, 15588, 12048, 28260, 20808, 42756, 31392, 59076, 43800, + 77220, 58032, 97188, 74088, 118980, 95776, 142596, 111672, 168036, 133200, 195300, 
156552, 224388, 181728, + 255300, 208728, 288036, 237552, 322596, 268200, 327236, 0, 6100, 6520, 20052, 15376, 36340, 26568, 54964, + 40096, 75924, 55960, 99220, 74160, 124852, 94696, 152820, 122656, 183124, 142776, 215764, 170320, 250740, + 200200, 288052, 232416, 327700, 266968, 369684, 303856, 414004, 343080, 420980, 0, 7460, 7928, 24516, + 18704, 44420, 32328, 67172, 48800, 92772, 68120, 121220, 90288, 152516, 115304, 186660, 149536, 223652, + 173880, 263492, 207440, 306180, 243848, 351716, 283104, 400100, 325208, 451332, 370160, 505412, 417960, + 514724, 0, 8820, 9336, 28980, 22032, 52500, 38088, 79380, 57504, 109620, 80280, 143220, 106416, 180180, + 135912, 220500, 176416, 264180, 204984, 311220, 244560, 361620, 287496, 415380, 333792, 472500, 383448, + 532980, 436464, 596820, 492840, 608468, 0, 10180, 10744, 33444, 25360, 60580, 43848, 91588, 66208, 126468, + 92440, 165220, 122544, 207844, 156520, 254340, 203296, 304708, 236088, 358948, 281680, 417060, 331144, + 479044, 384480, 544900, 441688, 614628, 502768, 688228, 567720, 702212, 0, 11540, 12152, 37908, 28688, + 68660, 49608, 103796, 74912, 143316, 104600, 187220, 138672, 235508, 177128, 288180, 230176, 345236, + 267192, 406676, 318800, 472500, 374792, 542708, 435168, 617300, 499928, 696276, 569072, 779636, 642600, + 795956, 0, 12900, 13560, 42372, 32016, 76740, 55368, 116004, 83616, 160164, 116760, 209220, 154800, + 263172, 197736, 322020, 257056, 385764, 298296, 454404, 355920, 527940, 418440, 606372, 485856, 689700, + 558168, 777924, 635376, 871044, 717480, 889700, 0, 14260, 14968, 46836, 35344, 84820, 61128, 128212, + 92320, 177012, 128920, 231220, 170928, 290836, 218344, 355860, 283936, 426292, 329400, 502132, 393040, + 583380, 462088, 670036, 536544, 762100, 616408, 859572, 701680, 962452, 792360, 983444, 0, 15620, 16376, + 51300, 38672, 92900, 66888, 140420, 101024, 193860, 141080, 253220, 187056, 318500, 238952, 389700, + 310816, 466820, 360504, 549860, 430160, 638820, 505736, 733700, 587232, 834500, 674648, 941220, 767984, + 1053860, 867240, 1077188, 0, 16980, 17784, 55764, 42000, 100980, 72648, 152628, 109728, 210708, 153240, + 275220, 203184, 346164, 259560, 423540, 337696, 507348, 391608, 597588, 467280, 694260, 549384, 797364, + 637920, 906900, 732888, 1022868, 834288, 1145268, 942120, 1170932, 0, 18340, 19192, 60228, 45328, 109060, + 78408, 164836, 118432, 227556, 165400, 297220, 219312, 373828, 280168, 457380, 364576, 547876, 422712, + 645316, 504400, 749700, 593032, 861028, 688608, 979300, 791128, 1104516, 900592, 1236676, 1017000, + 1264676, 0, 19700, 20600, 64692, 48656, 117140, 84168, 177044, 127136, 244404, 177560, 319220, 235440, + 401492, 300776, 491220, 391456, 588404, 453816, 693044, 541520, 805140, 636680, 924692, 739296, 1051700, + 849368, 1186164, 966896, 1328084, 1091880, 1358420, 0, 21060, 22008, 69156, 51984, 125220, 89928, 189252, + 135840, 261252, 189720, 341220, 251568, 429156, 321384, 525060, 418336, 628932, 484920, 740772, 578640, + 860580, 680328, 988356, 789984, 1124100, 907608, 1267812, 1033200, 1419492, 1166760, 1452164, 0, 22420, + 23416, 73620, 55312, 133300, 95688, 201460, 144544, 278100, 201880, 363220, 267696, 456820, 341992, + 558900, 445216, 669460, 516024, 788500, 615760, 916020, 723976, 1052020, 840672, 1196500, 965848, 1349460, + 1099504, 1510900, 1241640, 1545908, 0, 23780, 24824, 78084, 58640, 141380, 101448, 213668, 153248, 294948, + 214040, 385220, 283824, 484484, 362600, 592740, 472096, 709988, 547128, 836228, 652880, 971460, 767624, + 1115684, 891360, 1268900, 1024088, 
1431108, 1165808, 1602308, 1316520, 1639652, 0, 25140, 26232, 82548, + 61968, 149460, 107208, 225876, 161952, 311796, 226200, 407220, 299952, 512148, 383208, 626580, 498976, + 750516, 578232, 883956, 690000, 1026900, 811272, 1179348, 942048, 1341300, 1082328, 1512756, 1232112, + 1693716, 1391400, 1733396, 0, 26500, 27640, 87012, 65296, 157540, 112968, 238084, 170656, 328644, 238360, + 429220, 316080, 539812, 403816, 660420, 525856, 791044, 609336, 931684, 727120, 1082340, 854920, 1243012, + 992736, 1413700, 1140568, 1594404, 1298416, 1785124, 1466280, 1827140, 0, 27860, 29048, 91476, 68624, + 165620, 118728, 250292, 179360, 345492, 250520, 451220, 332208, 567476, 424424, 694260, 552736, 831572, + 640440, 979412, 764240, 1137780, 898568, 1306676, 1043424, 1486100, 1198808, 1676052, 1364720, 1876532, + 1541160, 1920884, 0, 29220, 30456, 95940, 71952, 173700, 124488, 262500, 188064, 362340, 262680, 473220, + 348336, 595140, 445032, 728100, 579616, 872100, 671544, 1027140, 801360, 1193220, 942216, 1370340, + 1094112, 1558500, 1257048, 1757700, 1431024, 1967940, 1616040, 2014628, 0, 30580, 31864, 100404, 75280, + 181780, 130248, 274708, 196768, 379188, 274840, 495220, 364464, 622804, 465640, 761940, 606496, 912628, + 702648, 1074868, 838480, 1248660, 985864, 1434004, 1144800, 1630900, 1315288, 1839348, 1497328, 2059348, + 1690920, 2108372, 0, 31940, 33272, 104868, 78608, 189860, 136008, 286916, 205472, 396036, 287000, 517220, + 380592, 650468, 486248, 795780, 633376, 953156, 733752, 1122596, 875600, 1304100, 1029512, 1497668, + 1195488, 1703300, 1373528, 1920996, 1563632, 2150756, 1765800, 2202116, 0, 33300, 34680, 109332, 81936, + 197940, 141768, 299124, 214176, 412884, 299160, 539220, 396720, 678132, 506856, 829620, 660256, 993684, + 764856, 1170324, 912720, 1359540, 1073160, 1561332, 1246176, 1775700, 1431768, 2002644, 1629936, 2242164, + 1840680, 2295860, 0, 34660, 36088, 113796, 85264, 206020, 147528, 311332, 222880, 429732, 311320, 561220, + 412848, 705796, 527464, 863460, 687136, 1034212, 795960, 1218052, 949840, 1414980, 1116808, 1624996, + 1296864, 1848100, 1490008, 2084292, 1696240, 2333572, 1915560, 2389604, 0, 36020, 37496, 118260, 88592, + 214100, 153288, 323540, 231584, 446580, 323480, 583220, 428976, 733460, 548072, 897300, 714016, 1074740, + 827064, 1265780, 986960, 1470420, 1160456, 1688660, 1347552, 1920500, 1548248, 2165940, 1762544, 2424980, + 1990440, 2483348, 0, 37380, 38904, 122724, 91920, 222180, 159048, 335748, 240288, 463428, 335640, 605220, + 445104, 761124, 568680, 931140, 740896, 1115268, 858168, 1313508, 1024080, 1525860, 1204104, 1752324, + 1398240, 1992900, 1606488, 2247588, 1828848, 2516388, 2065320, 2577092, 0, 38740, 40312, 127188, 95248, + 230260, 164808, 347956, 248992, 480276, 347800, 627220, 461232, 788788, 589288, 964980, 767776, 1155796, + 889272, 1361236, 1061200, 1581300, 1247752, 1815988, 1448928, 2065300, 1664728, 2329236, 1895152, 2607796, + 2140200, 2670836, 0, 40100, 41720, 131652, 98576, 238340, 170568, 360164, 257696, 497124, 359960, 649220, + 477360, 816452, 609896, 998820, 794656, 1196324, 920376, 1408964, 1098320, 1636740, 1291400, 1879652, + 1499616, 2137700, 1722968, 2410884, 1961456, 2699204, 2215080, 2764580, 0, 41460, 43128, 136116, 101904, + 246420, 176328, 372372, 266400, 513972, 372120, 671220, 493488, 844116, 630504, 1032660, 821536, 1236852, + 951480, 1456692, 1135440, 1692180, 1335048, 1943316, 1550304, 2210100, 1781208, 2492532, 2027760, 2790612, + 2289960, 2858324, 0, 42820, 44536, 140580, 105232, 254500, 182088, 384580, 275104, 
530820, 384280, 693220, + 509616, 871780, 651112, 1066500, 848416, 1277380, 982584, 1504420, 1172560, 1747620, 1378696, 2006980, + 1600992, 2282500, 1839448, 2574180, 2094064, 2882020, 2364840, 2952068 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=32, N=16, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 32, "type": "int" }, + { "name": "N", "data": 16, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=32, N=16, block_size=16, bits=4; symmetric", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512 + ], + "dims": [16, 32], + "type": "float32" + }, + { + "dims": [16, 2, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 
61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 29, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256 + ] + }, + { + "dims": [32], + "type": "float32", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31 + ] + } + ], + "outputs": [ + { + "dims": [16, 16], + "type": "float32", + "data": [ + -1116, -4036, -5868, -6612, -6268, -4836, -2316, 1292, 5956, 11772, 18644, 26604, 35652, 45788, 57012, + 53452, -2492, -12772, -19916, -23924, -24796, -22532, -17132, -8596, 5604, 17884, 35828, 56908, 81124, + 108476, 138964, 140844, -3868, -21508, -33964, -41236, -43324, -40228, -31948, -18484, 5252, 23996, 53012, + 87212, 126596, 171164, 220916, 228236, -5244, -30244, -48012, -58548, -61852, -57924, -46764, -28372, + 4900, 30108, 70196, 117516, 172068, 233852, 302868, 315628, -6620, -38980, -62060, -75860, -80380, -75620, + -61580, -38260, 4548, 36220, 87380, 147820, 217540, 296540, 384820, 403020, -7996, -47716, -76108, -93172, + -98908, -93316, -76396, -48148, 4196, 42332, 104564, 178124, 263012, 359228, 466772, 490412, -9372, + -56452, -90156, -110484, -117436, -111012, -91212, -58036, 3844, 48444, 121748, 208428, 308484, 421916, + 548724, 577804, -10748, -65188, -104204, -127796, -135964, -128708, -106028, -67924, 3492, 54556, 138932, + 238732, 353956, 484604, 630676, 665196, -12124, -73924, -118252, -145108, -154492, -146404, -120844, + -77812, 3140, 60668, 156116, 269036, 399428, 547292, 712628, 752588, -13500, -82660, -132300, -162420, + -173020, -164100, -135660, -87700, 2788, 66780, 173300, 299340, 444900, 609980, 794580, 839980, -14876, + -91396, -146348, -179732, -191548, -181796, -150476, -97588, 2436, 72892, 190484, 329644, 490372, 672668, + 876532, 927372, -16252, -100132, -160396, -197044, -210076, -199492, -165292, -107476, 2084, 79004, + 207668, 359948, 535844, 735356, 958484, 1014764, -17628, -108868, -174444, -214356, -228604, -217188, + -180108, -117364, 1732, 85116, 224852, 390252, 581316, 798044, 1040436, 1102156, -19004, -117604, -188492, + -231668, -247132, -234884, -194924, -127252, 1380, 91228, 242036, 420556, 626788, 860732, 1122388, + 1189548, -20380, -126340, -202540, -248980, -265660, -252580, -209740, -137140, 1028, 97340, 259220, + 450860, 672260, 923420, 1204340, 1276940, -21756, -135076, -216588, -266292, -284188, -270276, -224556, + -147028, 676, 103452, 276404, 481164, 717732, 986108, 1286292, 1364332 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=32, N=16, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 32, "type": "int" }, + { "name": 
"N", "data": 16, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=32, N=16, block_size=16, bits=4; asymmetric", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512 + ], + "dims": [16, 32], + "type": "float32" + }, + { + "dims": [16, 2, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 29, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 
157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256 + ] + }, + { + "dims": [32], + "type": "float32", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31 + ] + }, + { + "dims": [16], + "type": "uint8", + "data": [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128] + } + ], + "outputs": [ + { + "dims": [16, 16], + "type": "float32", + "data": [ + -1116, -1860, -1516, -84, 2436, 6044, 10740, 16524, 23364, 31356, 40404, 50540, 61764, 74076, 87476, + 86092, -2492, -2404, 820, 7180, 16676, 29308, 45076, 63980, 88548, 111196, 139508, 170956, 205540, 243260, + 284116, 296364, -3868, -2948, 3156, 14444, 30916, 52572, 79412, 111436, 153732, 191036, 238612, 291372, + 349316, 412444, 480756, 506636, -5244, -3492, 5492, 21708, 45156, 75836, 113748, 158892, 218916, 270876, + 337716, 411788, 493092, 581628, 677396, 716908, -6620, -4036, 7828, 28972, 59396, 99100, 148084, 206348, + 284100, 350716, 436820, 532204, 636868, 750812, 874036, 927180, -7996, -4580, 10164, 36236, 73636, 122364, + 182420, 253804, 349284, 430556, 535924, 652620, 780644, 919996, 1070676, 1137452, -9372, -5124, 12500, + 43500, 87876, 145628, 216756, 301260, 414468, 510396, 635028, 773036, 924420, 1089180, 1267316, 1347724, + -10748, -5668, 14836, 50764, 102116, 168892, 251092, 348716, 479652, 590236, 734132, 893452, 1068196, + 1258364, 1463956, 1557996, -12124, -6212, 17172, 58028, 116356, 192156, 285428, 396172, 544836, 670076, + 833236, 1013868, 1211972, 1427548, 1660596, 1768268, -13500, -6756, 19508, 65292, 130596, 215420, 319764, + 443628, 610020, 749916, 932340, 1134284, 1355748, 1596732, 1857236, 1978540, -14876, -7300, 21844, 72556, + 144836, 238684, 354100, 491084, 675204, 829756, 1031444, 1254700, 1499524, 1765916, 2053876, 2188812, + -16252, -7844, 24180, 79820, 159076, 261948, 388436, 538540, 740388, 909596, 1130548, 1375116, 1643300, + 1935100, 2250516, 2399084, -17628, -8388, 26516, 87084, 173316, 285212, 422772, 585996, 805572, 989436, + 1229652, 1495532, 1787076, 2104284, 2447156, 2609356, -19004, -8932, 28852, 94348, 187556, 308476, 457108, + 633452, 870756, 1069276, 1328756, 1615948, 1930852, 2273468, 2643796, 2819628, -20380, -9476, 31188, + 101612, 201796, 331740, 491444, 680908, 935940, 1149116, 1427860, 1736364, 2074628, 2442652, 2840436, + 3029900, -21756, -10020, 33524, 108876, 216036, 355004, 525780, 728364, 1001124, 1228956, 1526964, + 1856780, 2218404, 2611836, 3037076, 3240172 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=32, N=32, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 32, "type": "int" }, + { "name": "N", "data": 32, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=32, N=32, block_size=16, bits=4; symmetric", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 
21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, + 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, + 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, + 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, + 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, + 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, + 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, + 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, + 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, + 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, + 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 
734, 735, 736, + 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, + 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, + 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, + 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, + 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, + 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, + 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, + 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, + 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, + 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, + 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, + 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, + 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, + 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024 + ], + "dims": [32, 32], + "type": "float32" + }, + { + "dims": [32, 2, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 29, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 
403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512 + ] + }, + { + "dims": [64], + "type": "float32", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63 + ] + } + ], + "outputs": [ + { + "dims": [32, 32], + "type": "float32", + "data": [ + -1116, -4036, -5868, -6612, -6268, -4836, -2316, 1292, 5956, 11772, 18644, 26604, 35652, 45788, 57012, + 53452, -59740, -53956, -47084, -39124, -30076, -19940, -8716, 3596, 16996, 31484, 47060, 63724, 81476, + 100316, 120244, 109004, -2492, -12772, -19916, -23924, -24796, -22532, -17132, -8596, 5604, 17884, 35828, + 56908, 81124, 108476, 138964, 140844, -199356, -184548, -166604, -145524, -121308, -93956, -63468, -29844, + 6916, 46812, 89844, 136012, 185316, 237756, 293332, 287532, -3868, -21508, -33964, -41236, -43324, -40228, + -31948, -18484, 5252, 23996, 53012, 87212, 126596, 171164, 220916, 228236, -338972, -315140, -286124, + -251924, -212540, -167972, -118220, -63284, -3164, 62140, 132628, 208300, 289156, 375196, 466420, 466060, + -5244, -30244, -48012, -58548, -61852, -57924, -46764, -28372, 4900, 30108, 70196, 117516, 172068, 233852, + 302868, 315628, -478588, -445732, -405644, -358324, -303772, -241988, -172972, -96724, -13244, 77468, + 175412, 280588, 392996, 512636, 639508, 644588, -6620, -38980, -62060, -75860, -80380, -75620, -61580, + -38260, 4548, 36220, 87380, 147820, 217540, 296540, 384820, 403020, -618204, -576324, -525164, -464724, + -395004, -316004, -227724, -130164, -23324, 92796, 218196, 352876, 496836, 650076, 812596, 823116, -7996, + -47716, -76108, -93172, -98908, -93316, -76396, -48148, 4196, 42332, 104564, 178124, 263012, 359228, + 466772, 490412, -757820, -706916, -644684, -571124, -486236, -390020, -282476, -163604, -33404, 108124, + 260980, 425164, 600676, 787516, 985684, 1001644, -9372, -56452, -90156, -110484, -117436, -111012, -91212, + -58036, 3844, 48444, 121748, 208428, 308484, 421916, 548724, 577804, -897436, -837508, -764204, -677524, + -577468, -464036, -337228, -197044, -43484, 123452, 303764, 497452, 704516, 924956, 1158772, 1180172, + -10748, -65188, -104204, -127796, -135964, -128708, -106028, -67924, 3492, 54556, 138932, 238732, 353956, + 484604, 630676, 665196, -1037052, -968100, -883724, -783924, -668700, -538052, -391980, -230484, -53564, + 138780, 346548, 569740, 808356, 1062396, 1331860, 1358700, -12124, -73924, -118252, -145108, -154492, + -146404, -120844, -77812, 3140, 60668, 156116, 269036, 399428, 547292, 712628, 752588, -1176668, -1098692, + -1003244, -890324, -759932, -612068, -446732, -263924, -63644, 154108, 389332, 642028, 912196, 1199836, + 1504948, 1537228, -13500, -82660, -132300, -162420, -173020, -164100, -135660, -87700, 2788, 66780, + 173300, 299340, 444900, 609980, 794580, 839980, -1316284, -1229284, -1122764, -996724, -851164, -686084, + -501484, -297364, -73724, 169436, 432116, 
714316, 1016036, 1337276, 1678036, 1715756, -14876, -91396, + -146348, -179732, -191548, -181796, -150476, -97588, 2436, 72892, 190484, 329644, 490372, 672668, 876532, + 927372, -1455900, -1359876, -1242284, -1103124, -942396, -760100, -556236, -330804, -83804, 184764, + 474900, 786604, 1119876, 1474716, 1851124, 1894284, -16252, -100132, -160396, -197044, -210076, -199492, + -165292, -107476, 2084, 79004, 207668, 359948, 535844, 735356, 958484, 1014764, -1595516, -1490468, + -1361804, -1209524, -1033628, -834116, -610988, -364244, -93884, 200092, 517684, 858892, 1223716, 1612156, + 2024212, 2072812, -17628, -108868, -174444, -214356, -228604, -217188, -180108, -117364, 1732, 85116, + 224852, 390252, 581316, 798044, 1040436, 1102156, -1735132, -1621060, -1481324, -1315924, -1124860, + -908132, -665740, -397684, -103964, 215420, 560468, 931180, 1327556, 1749596, 2197300, 2251340, -19004, + -117604, -188492, -231668, -247132, -234884, -194924, -127252, 1380, 91228, 242036, 420556, 626788, + 860732, 1122388, 1189548, -1874748, -1751652, -1600844, -1422324, -1216092, -982148, -720492, -431124, + -114044, 230748, 603252, 1003468, 1431396, 1887036, 2370388, 2429868, -20380, -126340, -202540, -248980, + -265660, -252580, -209740, -137140, 1028, 97340, 259220, 450860, 672260, 923420, 1204340, 1276940, + -2014364, -1882244, -1720364, -1528724, -1307324, -1056164, -775244, -464564, -124124, 246076, 646036, + 1075756, 1535236, 2024476, 2543476, 2608396, -21756, -135076, -216588, -266292, -284188, -270276, -224556, + -147028, 676, 103452, 276404, 481164, 717732, 986108, 1286292, 1364332, -2153980, -2012836, -1839884, + -1635124, -1398556, -1130180, -829996, -498004, -134204, 261404, 688820, 1148044, 1639076, 2161916, + 2716564, 2786924, -23132, -143812, -230636, -283604, -302716, -287972, -239372, -156916, 324, 109564, + 293588, 511468, 763204, 1048796, 1368244, 1451724, -2293596, -2143428, -1959404, -1741524, -1489788, + -1204196, -884748, -531444, -144284, 276732, 731604, 1220332, 1742916, 2299356, 2889652, 2965452, -24508, + -152548, -244684, -300916, -321244, -305668, -254188, -166804, -28, 115676, 310772, 541772, 808676, + 1111484, 1450196, 1539116, -2433212, -2274020, -2078924, -1847924, -1581020, -1278212, -939500, -564884, + -154364, 292060, 774388, 1292620, 1846756, 2436796, 3062740, 3143980, -25884, -161284, -258732, -318228, + -339772, -323364, -269004, -176692, -380, 121788, 327956, 572076, 854148, 1174172, 1532148, 1626508, + -2572828, -2404612, -2198444, -1954324, -1672252, -1352228, -994252, -598324, -164444, 307388, 817172, + 1364908, 1950596, 2574236, 3235828, 3322508, -27260, -170020, -272780, -335540, -358300, -341060, -283820, + -186580, -732, 127900, 345140, 602380, 899620, 1236860, 1614100, 1713900, -2712444, -2535204, -2317964, + -2060724, -1763484, -1426244, -1049004, -631764, -174524, 322716, 859956, 1437196, 2054436, 2711676, + 3408916, 3501036, -28636, -178756, -286828, -352852, -376828, -358756, -298636, -196468, -1084, 134012, + 362324, 632684, 945092, 1299548, 1696052, 1801292, -2852060, -2665796, -2437484, -2167124, -1854716, + -1500260, -1103756, -665204, -184604, 338044, 902740, 1509484, 2158276, 2849116, 3582004, 3679564, -30012, + -187492, -300876, -370164, -395356, -376452, -313452, -206356, -1436, 140124, 379508, 662988, 990564, + 1362236, 1778004, 1888684, -2991676, -2796388, -2557004, -2273524, -1945948, -1574276, -1158508, -698644, + -194684, 353372, 945524, 1581772, 2262116, 2986556, 3755092, 3858092, -31388, -196228, -314924, -387476, + -413884, -394148, 
-328268, -216244, -1788, 146236, 396692, 693292, 1036036, 1424924, 1859956, 1976076, + -3131292, -2926980, -2676524, -2379924, -2037180, -1648292, -1213260, -732084, -204764, 368700, 988308, + 1654060, 2365956, 3123996, 3928180, 4036620, -32764, -204964, -328972, -404788, -432412, -411844, -343084, + -226132, -2140, 152348, 413876, 723596, 1081508, 1487612, 1941908, 2063468, -3270908, -3057572, -2796044, + -2486324, -2128412, -1722308, -1268012, -765524, -214844, 384028, 1031092, 1726348, 2469796, 3261436, + 4101268, 4215148, -34140, -213700, -343020, -422100, -450940, -429540, -357900, -236020, -2492, 158460, + 431060, 753900, 1126980, 1550300, 2023860, 2150860, -3410524, -3188164, -2915564, -2592724, -2219644, + -1796324, -1322764, -798964, -224924, 399356, 1073876, 1798636, 2573636, 3398876, 4274356, 4393676, + -35516, -222436, -357068, -439412, -469468, -447236, -372716, -245908, -2844, 164572, 448244, 784204, + 1172452, 1612988, 2105812, 2238252, -3550140, -3318756, -3035084, -2699124, -2310876, -1870340, -1377516, + -832404, -235004, 414684, 1116660, 1870924, 2677476, 3536316, 4447444, 4572204, -36892, -231172, -371116, + -456724, -487996, -464932, -387532, -255796, -3196, 170684, 465428, 814508, 1217924, 1675676, 2187764, + 2325644, -3689756, -3449348, -3154604, -2805524, -2402108, -1944356, -1432268, -865844, -245084, 430012, + 1159444, 1943212, 2781316, 3673756, 4620532, 4750732, -38268, -239908, -385164, -474036, -506524, -482628, + -402348, -265684, -3548, 176796, 482612, 844812, 1263396, 1738364, 2269716, 2413036, -3829372, -3579940, + -3274124, -2911924, -2493340, -2018372, -1487020, -899284, -255164, 445340, 1202228, 2015500, 2885156, + 3811196, 4793620, 4929260, -39644, -248644, -399212, -491348, -525052, -500324, -417164, -275572, -3900, + 182908, 499796, 875116, 1308868, 1801052, 2351668, 2500428, -3968988, -3710532, -3393644, -3018324, + -2584572, -2092388, -1541772, -932724, -265244, 460668, 1245012, 2087788, 2988996, 3948636, 4966708, + 5107788, -41020, -257380, -413260, -508660, -543580, -518020, -431980, -285460, -4252, 189020, 516980, + 905420, 1354340, 1863740, 2433620, 2587820, -4108604, -3841124, -3513164, -3124724, -2675804, -2166404, + -1596524, -966164, -275324, 475996, 1287796, 2160076, 3092836, 4086076, 5139796, 5286316, -42396, -266116, + -427308, -525972, -562108, -535716, -446796, -295348, -4604, 195132, 534164, 935724, 1399812, 1926428, + 2515572, 2675212, -4248220, -3971716, -3632684, -3231124, -2767036, -2240420, -1651276, -999604, -285404, + 491324, 1330580, 2232364, 3196676, 4223516, 5312884, 5464844, -43772, -274852, -441356, -543284, -580636, + -553412, -461612, -305236, -4956, 201244, 551348, 966028, 1445284, 1989116, 2597524, 2762604, -4387836, + -4102308, -3752204, -3337524, -2858268, -2314436, -1706028, -1033044, -295484, 506652, 1373364, 2304652, + 3300516, 4360956, 5485972, 5643372 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=32, N=32, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 32, "type": "int" }, + { "name": "N", "data": 32, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=32, N=32, block_size=16, bits=4; asymmetric", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 
43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, + 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, + 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, + 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, + 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, + 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, + 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, + 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, + 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, + 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, + 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, + 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 
752, 753, 754, 755, 756, 757, + 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, + 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, + 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, + 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, + 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, + 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, + 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, + 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, + 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, + 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, + 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, + 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, + 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024 + ], + "dims": [32, 32], + "type": "float32" + }, + { + "dims": [32, 2, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 29, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 
421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512 + ] + }, + { + "dims": [64], + "type": "float32", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63 + ] + }, + { + "dims": [32], + "type": "uint8", + "data": [ + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 + ] + } + ], + "outputs": [ + { + "dims": [32, 32], + "type": "float32", + "data": [ + -1116, -1860, -1516, -84, 2436, 6044, 10740, 16524, 23364, 31356, 40404, 50540, 61764, 74076, 87476, + 86092, -24924, -16964, -7916, 2220, 13444, 25756, 39156, 53644, 69220, 85884, 103636, 122476, 142404, + 163420, 185524, 176460, -2492, -2404, 820, 7180, 16676, 29308, 45076, 63980, 88548, 111196, 139508, + 170956, 205540, 243260, 284116, 296364, -33468, -8292, 20020, 51468, 86052, 123772, 164628, 208620, + 255748, 306012, 359412, 415948, 475620, 538428, 604372, 608940, -3868, -2948, 3156, 14444, 30916, 52572, + 79412, 111436, 153732, 191036, 238612, 291372, 349316, 412444, 480756, 506636, -42012, 380, 47956, 100716, + 158660, 221788, 290100, 363596, 442276, 526140, 615188, 709420, 808836, 913436, 1023220, 1041420, -5244, + -3492, 5492, 21708, 45156, 75836, 113748, 158892, 218916, 270876, 337716, 411788, 493092, 581628, 677396, + 716908, -50556, 9052, 75892, 149964, 231268, 319804, 415572, 518572, 628804, 746268, 870964, 1002892, + 1142052, 1288444, 1442068, 1473900, -6620, -4036, 7828, 28972, 59396, 99100, 148084, 206348, 284100, + 350716, 436820, 532204, 636868, 750812, 874036, 927180, -59100, 17724, 103828, 199212, 303876, 417820, + 541044, 673548, 815332, 966396, 1126740, 1296364, 1475268, 1663452, 1860916, 1906380, -7996, -4580, 10164, + 36236, 73636, 122364, 182420, 253804, 349284, 430556, 535924, 652620, 780644, 919996, 1070676, 1137452, + -67644, 26396, 131764, 248460, 376484, 515836, 666516, 828524, 1001860, 1186524, 1382516, 1589836, + 1808484, 2038460, 2279764, 2338860, -9372, -5124, 12500, 43500, 87876, 145628, 216756, 301260, 414468, + 510396, 635028, 773036, 924420, 1089180, 1267316, 1347724, -76188, 35068, 159700, 297708, 449092, 613852, + 791988, 983500, 1188388, 1406652, 1638292, 1883308, 2141700, 2413468, 2698612, 2771340, -10748, -5668, + 14836, 50764, 102116, 168892, 251092, 348716, 479652, 590236, 734132, 893452, 1068196, 1258364, 1463956, + 1557996, -84732, 43740, 187636, 346956, 521700, 711868, 917460, 1138476, 1374916, 1626780, 1894068, + 2176780, 2474916, 2788476, 3117460, 3203820, -12124, -6212, 17172, 58028, 116356, 192156, 285428, 396172, + 544836, 670076, 833236, 1013868, 1211972, 1427548, 1660596, 1768268, -93276, 52412, 215572, 396204, + 594308, 809884, 1042932, 1293452, 1561444, 1846908, 2149844, 2470252, 2808132, 3163484, 3536308, 3636300, + -13500, -6756, 19508, 65292, 130596, 215420, 319764, 443628, 610020, 749916, 932340, 1134284, 1355748, + 1596732, 1857236, 1978540, -101820, 61084, 
243508, 445452, 666916, 907900, 1168404, 1448428, 1747972, + 2067036, 2405620, 2763724, 3141348, 3538492, 3955156, 4068780, -14876, -7300, 21844, 72556, 144836, + 238684, 354100, 491084, 675204, 829756, 1031444, 1254700, 1499524, 1765916, 2053876, 2188812, -110364, + 69756, 271444, 494700, 739524, 1005916, 1293876, 1603404, 1934500, 2287164, 2661396, 3057196, 3474564, + 3913500, 4374004, 4501260, -16252, -7844, 24180, 79820, 159076, 261948, 388436, 538540, 740388, 909596, + 1130548, 1375116, 1643300, 1935100, 2250516, 2399084, -118908, 78428, 299380, 543948, 812132, 1103932, + 1419348, 1758380, 2121028, 2507292, 2917172, 3350668, 3807780, 4288508, 4792852, 4933740, -17628, -8388, + 26516, 87084, 173316, 285212, 422772, 585996, 805572, 989436, 1229652, 1495532, 1787076, 2104284, 2447156, + 2609356, -127452, 87100, 327316, 593196, 884740, 1201948, 1544820, 1913356, 2307556, 2727420, 3172948, + 3644140, 4140996, 4663516, 5211700, 5366220, -19004, -8932, 28852, 94348, 187556, 308476, 457108, 633452, + 870756, 1069276, 1328756, 1615948, 1930852, 2273468, 2643796, 2819628, -135996, 95772, 355252, 642444, + 957348, 1299964, 1670292, 2068332, 2494084, 2947548, 3428724, 3937612, 4474212, 5038524, 5630548, 5798700, + -20380, -9476, 31188, 101612, 201796, 331740, 491444, 680908, 935940, 1149116, 1427860, 1736364, 2074628, + 2442652, 2840436, 3029900, -144540, 104444, 383188, 691692, 1029956, 1397980, 1795764, 2223308, 2680612, + 3167676, 3684500, 4231084, 4807428, 5413532, 6049396, 6231180, -21756, -10020, 33524, 108876, 216036, + 355004, 525780, 728364, 1001124, 1228956, 1526964, 1856780, 2218404, 2611836, 3037076, 3240172, -153084, + 113116, 411124, 740940, 1102564, 1495996, 1921236, 2378284, 2867140, 3387804, 3940276, 4524556, 5140644, + 5788540, 6468244, 6663660, -23132, -10564, 35860, 116140, 230276, 378268, 560116, 775820, 1066308, + 1308796, 1626068, 1977196, 2362180, 2781020, 3233716, 3450444, -161628, 121788, 439060, 790188, 1175172, + 1594012, 2046708, 2533260, 3053668, 3607932, 4196052, 4818028, 5473860, 6163548, 6887092, 7096140, -24508, + -11108, 38196, 123404, 244516, 401532, 594452, 823276, 1131492, 1388636, 1725172, 2097612, 2505956, + 2950204, 3430356, 3660716, -170172, 130460, 466996, 839436, 1247780, 1692028, 2172180, 2688236, 3240196, + 3828060, 4451828, 5111500, 5807076, 6538556, 7305940, 7528620, -25884, -11652, 40532, 130668, 258756, + 424796, 628788, 870732, 1196676, 1468476, 1824276, 2218028, 2649732, 3119388, 3626996, 3870988, -178716, + 139132, 494932, 888684, 1320388, 1790044, 2297652, 2843212, 3426724, 4048188, 4707604, 5404972, 6140292, + 6913564, 7724788, 7961100, -27260, -12196, 42868, 137932, 272996, 448060, 663124, 918188, 1261860, + 1548316, 1923380, 2338444, 2793508, 3288572, 3823636, 4081260, -187260, 147804, 522868, 937932, 1392996, + 1888060, 2423124, 2998188, 3613252, 4268316, 4963380, 5698444, 6473508, 7288572, 8143636, 8393580, -28636, + -12740, 45204, 145196, 287236, 471324, 697460, 965644, 1327044, 1628156, 2022484, 2458860, 2937284, + 3457756, 4020276, 4291532, -195804, 156476, 550804, 987180, 1465604, 1986076, 2548596, 3153164, 3799780, + 4488444, 5219156, 5991916, 6806724, 7663580, 8562484, 8826060, -30012, -13284, 47540, 152460, 301476, + 494588, 731796, 1013100, 1392228, 1707996, 2121588, 2579276, 3081060, 3626940, 4216916, 4501804, -204348, + 165148, 578740, 1036428, 1538212, 2084092, 2674068, 3308140, 3986308, 4708572, 5474932, 6285388, 7139940, + 8038588, 8981332, 9258540, -31388, -13828, 49876, 159724, 315716, 517852, 766132, 1060556, 1457412, + 
1787836, 2220692, 2699692, 3224836, 3796124, 4413556, 4712076, -212892, 173820, 606676, 1085676, 1610820, + 2182108, 2799540, 3463116, 4172836, 4928700, 5730708, 6578860, 7473156, 8413596, 9400180, 9691020, -32764, + -14372, 52212, 166988, 329956, 541116, 800468, 1108012, 1522596, 1867676, 2319796, 2820108, 3368612, + 3965308, 4610196, 4922348, -221436, 182492, 634612, 1134924, 1683428, 2280124, 2925012, 3618092, 4359364, + 5148828, 5986484, 6872332, 7806372, 8788604, 9819028, 10123500, -34140, -14916, 54548, 174252, 344196, + 564380, 834804, 1155468, 1587780, 1947516, 2418900, 2940524, 3512388, 4134492, 4806836, 5132620, -229980, + 191164, 662548, 1184172, 1756036, 2378140, 3050484, 3773068, 4545892, 5368956, 6242260, 7165804, 8139588, + 9163612, 10237876, 10555980, -35516, -15460, 56884, 181516, 358436, 587644, 869140, 1202924, 1652964, + 2027356, 2518004, 3060940, 3656164, 4303676, 5003476, 5342892, -238524, 199836, 690484, 1233420, 1828644, + 2476156, 3175956, 3928044, 4732420, 5589084, 6498036, 7459276, 8472804, 9538620, 10656724, 10988460, + -36892, -16004, 59220, 188780, 372676, 610908, 903476, 1250380, 1718148, 2107196, 2617108, 3181356, + 3799940, 4472860, 5200116, 5553164, -247068, 208508, 718420, 1282668, 1901252, 2574172, 3301428, 4083020, + 4918948, 5809212, 6753812, 7752748, 8806020, 9913628, 11075572, 11420940, -38268, -16548, 61556, 196044, + 386916, 634172, 937812, 1297836, 1783332, 2187036, 2716212, 3301772, 3943716, 4642044, 5396756, 5763436, + -255612, 217180, 746356, 1331916, 1973860, 2672188, 3426900, 4237996, 5105476, 6029340, 7009588, 8046220, + 9139236, 10288636, 11494420, 11853420, -39644, -17092, 63892, 203308, 401156, 657436, 972148, 1345292, + 1848516, 2266876, 2815316, 3422188, 4087492, 4811228, 5593396, 5973708, -264156, 225852, 774292, 1381164, + 2046468, 2770204, 3552372, 4392972, 5292004, 6249468, 7265364, 8339692, 9472452, 10663644, 11913268, + 12285900, -41020, -17636, 66228, 210572, 415396, 680700, 1006484, 1392748, 1913700, 2346716, 2914420, + 3542604, 4231268, 4980412, 5790036, 6183980, -272700, 234524, 802228, 1430412, 2119076, 2868220, 3677844, + 4547948, 5478532, 6469596, 7521140, 8633164, 9805668, 11038652, 12332116, 12718380, -42396, -18180, 68564, + 217836, 429636, 703964, 1040820, 1440204, 1978884, 2426556, 3013524, 3663020, 4375044, 5149596, 5986676, + 6394252, -281244, 243196, 830164, 1479660, 2191684, 2966236, 3803316, 4702924, 5665060, 6689724, 7776916, + 8926636, 10138884, 11413660, 12750964, 13150860, -43772, -18724, 70900, 225100, 443876, 727228, 1075156, + 1487660, 2044068, 2506396, 3112628, 3783436, 4518820, 5318780, 6183316, 6604524, -289788, 251868, 858100, + 1528908, 2264292, 3064252, 3928788, 4857900, 5851588, 6909852, 8032692, 9220108, 10472100, 11788668, + 13169812, 13583340 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=32, N=32, block_size=32, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 32, "type": "int" }, + { "name": "N", "data": 32, "type": "int" }, + { "name": "block_size", "data": 32, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=32, N=32, block_size=32, bits=4; symmetric", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 
67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, + 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, + 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, + 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, + 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, + 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, + 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, + 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, + 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, + 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, + 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, + 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, + 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 
771, 772, 773, 774, 775, 776, 777, 778, + 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, + 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, + 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, + 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, + 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, + 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, + 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, + 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, + 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, + 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, + 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, + 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024 + ], + "dims": [32, 32], + "type": "float32" + }, + { + "dims": [32, 1, 16], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 29, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 
440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512 + ] + }, + { + "dims": [32], + "type": "float32", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31 + ] + } + ], + "outputs": [ + { + "dims": [32, 32], + "type": "float32", + "data": [ + 0, -1560, -2576, -3048, -2976, -2360, -1200, 504, 2736, 5544, 8880, 12760, 17184, 22152, 27664, 26040, + -29312, -26520, -23184, -19304, -14880, -9912, -4400, 1656, 8256, 15400, 23088, 31320, 40096, 49416, + 59280, 53816, 0, -5368, -9168, -11400, -12064, -11160, -8688, -4648, 2224, 8136, 16880, 27192, 39072, + 52520, 67536, 68760, -98432, -91256, -82512, -72200, -60320, -46872, -31856, -15272, 2880, 22600, 43888, + 66744, 91168, 117160, 144720, 142104, 0, -9176, -15760, -19752, -21152, -19960, -16176, -9800, 1712, + 10728, 24880, 41624, 60960, 82888, 107408, 111480, -167552, -155992, -141840, -125096, -105760, -83832, + -59312, -32200, -2496, 29800, 64688, 102168, 142240, 184904, 230160, 230392, 0, -12984, -22352, -28104, + -30240, -28760, -23664, -14952, 1200, 13320, 32880, 56056, 82848, 113256, 147280, 154200, -236672, + -220728, -201168, -177992, -151200, -120792, -86768, -49128, -7872, 37000, 85488, 137592, 193312, 252648, + 315600, 318680, 0, -16792, -28944, -36456, -39328, -37560, -31152, -20104, 688, 15912, 40880, 70488, + 104736, 143624, 187152, 196920, -305792, -285464, -260496, -230888, -196640, -157752, -114224, -66056, + -13248, 44200, 106288, 173016, 244384, 320392, 401040, 406968, 0, -20600, -35536, -44808, -48416, -46360, + -38640, -25256, 176, 18504, 48880, 84920, 126624, 173992, 227024, 239640, -374912, -350200, -319824, + -283784, -242080, -194712, -141680, -82984, -18624, 51400, 127088, 208440, 295456, 388136, 486480, 495256, + 0, -24408, -42128, -53160, -57504, -55160, -46128, -30408, -336, 21096, 56880, 99352, 148512, 204360, + 266896, 282360, -444032, -414936, -379152, -336680, -287520, -231672, -169136, -99912, -24000, 58600, + 147888, 243864, 346528, 455880, 571920, 583544, 0, -28216, -48720, -61512, -66592, -63960, -53616, -35560, + -848, 23688, 64880, 113784, 170400, 234728, 306768, 325080, -513152, -479672, -438480, -389576, -332960, + -268632, -196592, -116840, -29376, 65800, 168688, 279288, 397600, 523624, 657360, 671832, 0, -32024, + -55312, -69864, -75680, -72760, -61104, -40712, -1360, 26280, 72880, 128216, 192288, 265096, 346640, + 367800, -582272, -544408, -497808, -442472, -378400, -305592, -224048, -133768, -34752, 73000, 189488, + 314712, 448672, 591368, 742800, 760120, 0, -35832, -61904, -78216, -84768, -81560, -68592, -45864, -1872, + 28872, 80880, 142648, 214176, 295464, 386512, 410520, -651392, -609144, -557136, -495368, -423840, + -342552, -251504, -150696, -40128, 80200, 210288, 350136, 499744, 659112, 828240, 848408, 0, -39640, + -68496, -86568, -93856, -90360, -76080, -51016, -2384, 31464, 88880, 157080, 236064, 325832, 426384, + 453240, -720512, -673880, -616464, -548264, -469280, -379512, -278960, -167624, -45504, 87400, 231088, + 385560, 550816, 726856, 913680, 936696, 0, -43448, -75088, -94920, -102944, -99160, -83568, -56168, -2896, + 34056, 96880, 171512, 257952, 356200, 466256, 495960, -789632, -738616, 
-675792, -601160, -514720, + -416472, -306416, -184552, -50880, 94600, 251888, 420984, 601888, 794600, 999120, 1024984, 0, -47256, + -81680, -103272, -112032, -107960, -91056, -61320, -3408, 36648, 104880, 185944, 279840, 386568, 506128, + 538680, -858752, -803352, -735120, -654056, -560160, -453432, -333872, -201480, -56256, 101800, 272688, + 456408, 652960, 862344, 1084560, 1113272, 0, -51064, -88272, -111624, -121120, -116760, -98544, -66472, + -3920, 39240, 112880, 200376, 301728, 416936, 546000, 581400, -927872, -868088, -794448, -706952, -605600, + -490392, -361328, -218408, -61632, 109000, 293488, 491832, 704032, 930088, 1170000, 1201560, 0, -54872, + -94864, -119976, -130208, -125560, -106032, -71624, -4432, 41832, 120880, 214808, 323616, 447304, 585872, + 624120, -996992, -932824, -853776, -759848, -651040, -527352, -388784, -235336, -67008, 116200, 314288, + 527256, 755104, 997832, 1255440, 1289848, 0, -58680, -101456, -128328, -139296, -134360, -113520, -76776, + -4944, 44424, 128880, 229240, 345504, 477672, 625744, 666840, -1066112, -997560, -913104, -812744, + -696480, -564312, -416240, -252264, -72384, 123400, 335088, 562680, 806176, 1065576, 1340880, 1378136, 0, + -62488, -108048, -136680, -148384, -143160, -121008, -81928, -5456, 47016, 136880, 243672, 367392, 508040, + 665616, 709560, -1135232, -1062296, -972432, -865640, -741920, -601272, -443696, -269192, -77760, 130600, + 355888, 598104, 857248, 1133320, 1426320, 1466424, 0, -66296, -114640, -145032, -157472, -151960, -128496, + -87080, -5968, 49608, 144880, 258104, 389280, 538408, 705488, 752280, -1204352, -1127032, -1031760, + -918536, -787360, -638232, -471152, -286120, -83136, 137800, 376688, 633528, 908320, 1201064, 1511760, + 1554712, 0, -70104, -121232, -153384, -166560, -160760, -135984, -92232, -6480, 52200, 152880, 272536, + 411168, 568776, 745360, 795000, -1273472, -1191768, -1091088, -971432, -832800, -675192, -498608, -303048, + -88512, 145000, 397488, 668952, 959392, 1268808, 1597200, 1643000, 0, -73912, -127824, -161736, -175648, + -169560, -143472, -97384, -6992, 54792, 160880, 286968, 433056, 599144, 785232, 837720, -1342592, + -1256504, -1150416, -1024328, -878240, -712152, -526064, -319976, -93888, 152200, 418288, 704376, 1010464, + 1336552, 1682640, 1731288, 0, -77720, -134416, -170088, -184736, -178360, -150960, -102536, -7504, 57384, + 168880, 301400, 454944, 629512, 825104, 880440, -1411712, -1321240, -1209744, -1077224, -923680, -749112, + -553520, -336904, -99264, 159400, 439088, 739800, 1061536, 1404296, 1768080, 1819576, 0, -81528, -141008, + -178440, -193824, -187160, -158448, -107688, -8016, 59976, 176880, 315832, 476832, 659880, 864976, 923160, + -1480832, -1385976, -1269072, -1130120, -969120, -786072, -580976, -353832, -104640, 166600, 459888, + 775224, 1112608, 1472040, 1853520, 1907864, 0, -85336, -147600, -186792, -202912, -195960, -165936, + -112840, -8528, 62568, 184880, 330264, 498720, 690248, 904848, 965880, -1549952, -1450712, -1328400, + -1183016, -1014560, -823032, -608432, -370760, -110016, 173800, 480688, 810648, 1163680, 1539784, 1938960, + 1996152, 0, -89144, -154192, -195144, -212000, -204760, -173424, -117992, -9040, 65160, 192880, 344696, + 520608, 720616, 944720, 1008600, -1619072, -1515448, -1387728, -1235912, -1060000, -859992, -635888, + -387688, -115392, 181000, 501488, 846072, 1214752, 1607528, 2024400, 2084440, 0, -92952, -160784, -203496, + -221088, -213560, -180912, -123144, -9552, 67752, 200880, 359128, 542496, 750984, 984592, 1051320, + -1688192, -1580184, 
-1447056, -1288808, -1105440, -896952, -663344, -404616, -120768, 188200, 522288, + 881496, 1265824, 1675272, 2109840, 2172728, 0, -96760, -167376, -211848, -230176, -222360, -188400, + -128296, -10064, 70344, 208880, 373560, 564384, 781352, 1024464, 1094040, -1757312, -1644920, -1506384, + -1341704, -1150880, -933912, -690800, -421544, -126144, 195400, 543088, 916920, 1316896, 1743016, 2195280, + 2261016, 0, -100568, -173968, -220200, -239264, -231160, -195888, -133448, -10576, 72936, 216880, 387992, + 586272, 811720, 1064336, 1136760, -1826432, -1709656, -1565712, -1394600, -1196320, -970872, -718256, + -438472, -131520, 202600, 563888, 952344, 1367968, 1810760, 2280720, 2349304, 0, -104376, -180560, + -228552, -248352, -239960, -203376, -138600, -11088, 75528, 224880, 402424, 608160, 842088, 1104208, + 1179480, -1895552, -1774392, -1625040, -1447496, -1241760, -1007832, -745712, -455400, -136896, 209800, + 584688, 987768, 1419040, 1878504, 2366160, 2437592, 0, -108184, -187152, -236904, -257440, -248760, + -210864, -143752, -11600, 78120, 232880, 416856, 630048, 872456, 1144080, 1222200, -1964672, -1839128, + -1684368, -1500392, -1287200, -1044792, -773168, -472328, -142272, 217000, 605488, 1023192, 1470112, + 1946248, 2451600, 2525880, 0, -111992, -193744, -245256, -266528, -257560, -218352, -148904, -12112, + 80712, 240880, 431288, 651936, 902824, 1183952, 1264920, -2033792, -1903864, -1743696, -1553288, -1332640, + -1081752, -800624, -489256, -147648, 224200, 626288, 1058616, 1521184, 2013992, 2537040, 2614168, 0, + -115800, -200336, -253608, -275616, -266360, -225840, -154056, -12624, 83304, 248880, 445720, 673824, + 933192, 1223824, 1307640, -2102912, -1968600, -1803024, -1606184, -1378080, -1118712, -828080, -506184, + -153024, 231400, 647088, 1094040, 1572256, 2081736, 2622480, 2702456, 0, -119608, -206928, -261960, + -284704, -275160, -233328, -159208, -13136, 85896, 256880, 460152, 695712, 963560, 1263696, 1350360, + -2172032, -2033336, -1862352, -1659080, -1423520, -1155672, -855536, -523112, -158400, 238600, 667888, + 1129464, 1623328, 2149480, 2707920, 2790744 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=32, N=32, block_size=32, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 32, "type": "int" }, + { "name": "N", "data": 32, "type": "int" }, + { "name": "block_size", "data": 32, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=32, N=32, block_size=32, bits=4; asymmetric", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 
205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, + 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, + 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, + 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, + 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, + 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, + 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, + 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, + 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, + 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, + 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, + 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, + 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, + 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, + 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, + 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, + 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, + 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, + 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 
902, 903, 904, + 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, + 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, + 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, + 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, + 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, + 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024 + ], + "dims": [32, 32], + "type": "float32" + }, + { + "dims": [32, 1, 16], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 29, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512 + ] + }, + { + "dims": [32], + "type": "float32", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31 + ] + }, + { + "dims": [32], + "type": "uint8", + "data": [ + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 + ] + } + ], + "outputs": [ + { + "dims": [32, 32], + "type": "float32", + "data": [ + 0, 2664, 5872, 9624, 13920, 18760, 24144, 30072, 36528, 43560, 51120, 59224, 67872, 77064, 86800, 89400, + 38272, 45288, 52848, 60952, 69600, 78792, 88528, 98808, 109632, 121000, 132912, 145368, 158368, 171912, + 186000, 184760, 0, 7048, 15664, 25848, 37600, 50920, 65808, 82264, 101552, 119880, 141040, 163768, 188064, + 213928, 241360, 255000, 100224, 119816, 140976, 163704, 188000, 213864, 241296, 270296, 300864, 333000, + 366704, 401976, 438816, 477224, 517200, 527000, 0, 11432, 25456, 42072, 61280, 83080, 107472, 134456, + 166576, 196200, 230960, 268312, 308256, 350792, 395920, 420600, 162176, 194344, 229104, 266456, 306400, + 348936, 394064, 441784, 492096, 545000, 600496, 658584, 719264, 782536, 848400, 869240, 0, 15816, 35248, + 58296, 84960, 115240, 149136, 186648, 231600, 272520, 320880, 372856, 428448, 487656, 550480, 586200, + 224128, 268872, 317232, 369208, 424800, 484008, 546832, 613272, 683328, 757000, 834288, 915192, 999712, + 1087848, 1179600, 1211480, 0, 20200, 45040, 74520, 108640, 147400, 190800, 238840, 296624, 348840, 410800, + 477400, 548640, 624520, 705040, 751800, 286080, 343400, 405360, 471960, 543200, 619080, 699600, 784760, + 874560, 969000, 1068080, 1171800, 1280160, 1393160, 1510800, 1553720, 0, 24584, 54832, 90744, 132320, + 179560, 232464, 291032, 361648, 425160, 500720, 581944, 668832, 761384, 859600, 917400, 348032, 417928, + 493488, 574712, 661600, 754152, 852368, 956248, 1065792, 1181000, 1301872, 1428408, 1560608, 1698472, + 1842000, 1895960, 0, 28968, 64624, 106968, 156000, 211720, 274128, 343224, 426672, 501480, 590640, 686488, + 789024, 898248, 1014160, 1083000, 409984, 492456, 581616, 677464, 780000, 889224, 1005136, 1127736, + 1257024, 1393000, 1535664, 1685016, 1841056, 2003784, 2173200, 2238200, 0, 33352, 74416, 123192, 179680, + 243880, 315792, 395416, 491696, 577800, 680560, 791032, 909216, 1035112, 1168720, 1248600, 471936, 566984, + 669744, 780216, 898400, 1024296, 1157904, 1299224, 1448256, 1605000, 1769456, 1941624, 2121504, 2309096, + 2504400, 2580440, 0, 37736, 84208, 139416, 203360, 276040, 357456, 447608, 556720, 654120, 770480, 895576, + 1029408, 1171976, 1323280, 1414200, 533888, 641512, 757872, 882968, 1016800, 1159368, 1310672, 1470712, + 1639488, 1817000, 2003248, 2198232, 2401952, 2614408, 2835600, 2922680, 0, 42120, 94000, 155640, 227040, + 308200, 399120, 499800, 621744, 730440, 860400, 1000120, 1149600, 1308840, 1477840, 1579800, 595840, + 716040, 846000, 985720, 1135200, 1294440, 1463440, 1642200, 1830720, 2029000, 2237040, 2454840, 2682400, + 2919720, 3166800, 3264920, 0, 46504, 103792, 171864, 250720, 340360, 440784, 551992, 686768, 806760, + 950320, 1104664, 1269792, 1445704, 1632400, 1745400, 657792, 790568, 934128, 1088472, 1253600, 1429512, + 1616208, 1813688, 2021952, 2241000, 2470832, 2711448, 2962848, 3225032, 3498000, 3607160, 0, 50888, + 113584, 188088, 274400, 372520, 482448, 604184, 751792, 883080, 1040240, 1209208, 1389984, 1582568, + 1786960, 1911000, 719744, 865096, 1022256, 1191224, 1372000, 1564584, 1768976, 1985176, 2213184, 2453000, + 2704624, 2968056, 3243296, 3530344, 3829200, 3949400, 0, 55272, 123376, 204312, 298080, 404680, 524112, + 656376, 816816, 959400, 1130160, 1313752, 1510176, 1719432, 1941520, 2076600, 781696, 939624, 1110384, + 1293976, 1490400, 1699656, 1921744, 2156664, 2404416, 2665000, 2938416, 
3224664, 3523744, 3835656, + 4160400, 4291640, 0, 59656, 133168, 220536, 321760, 436840, 565776, 708568, 881840, 1035720, 1220080, + 1418296, 1630368, 1856296, 2096080, 2242200, 843648, 1014152, 1198512, 1396728, 1608800, 1834728, 2074512, + 2328152, 2595648, 2877000, 3172208, 3481272, 3804192, 4140968, 4491600, 4633880, 0, 64040, 142960, 236760, + 345440, 469000, 607440, 760760, 946864, 1112040, 1310000, 1522840, 1750560, 1993160, 2250640, 2407800, + 905600, 1088680, 1286640, 1499480, 1727200, 1969800, 2227280, 2499640, 2786880, 3089000, 3406000, 3737880, + 4084640, 4446280, 4822800, 4976120, 0, 68424, 152752, 252984, 369120, 501160, 649104, 812952, 1011888, + 1188360, 1399920, 1627384, 1870752, 2130024, 2405200, 2573400, 967552, 1163208, 1374768, 1602232, 1845600, + 2104872, 2380048, 2671128, 2978112, 3301000, 3639792, 3994488, 4365088, 4751592, 5154000, 5318360, 0, + 72808, 162544, 269208, 392800, 533320, 690768, 865144, 1076912, 1264680, 1489840, 1731928, 1990944, + 2266888, 2559760, 2739000, 1029504, 1237736, 1462896, 1704984, 1964000, 2239944, 2532816, 2842616, + 3169344, 3513000, 3873584, 4251096, 4645536, 5056904, 5485200, 5660600, 0, 77192, 172336, 285432, 416480, + 565480, 732432, 917336, 1141936, 1341000, 1579760, 1836472, 2111136, 2403752, 2714320, 2904600, 1091456, + 1312264, 1551024, 1807736, 2082400, 2375016, 2685584, 3014104, 3360576, 3725000, 4107376, 4507704, + 4925984, 5362216, 5816400, 6002840, 0, 81576, 182128, 301656, 440160, 597640, 774096, 969528, 1206960, + 1417320, 1669680, 1941016, 2231328, 2540616, 2868880, 3070200, 1153408, 1386792, 1639152, 1910488, + 2200800, 2510088, 2838352, 3185592, 3551808, 3937000, 4341168, 4764312, 5206432, 5667528, 6147600, + 6345080, 0, 85960, 191920, 317880, 463840, 629800, 815760, 1021720, 1271984, 1493640, 1759600, 2045560, + 2351520, 2677480, 3023440, 3235800, 1215360, 1461320, 1727280, 2013240, 2319200, 2645160, 2991120, + 3357080, 3743040, 4149000, 4574960, 5020920, 5486880, 5972840, 6478800, 6687320, 0, 90344, 201712, 334104, + 487520, 661960, 857424, 1073912, 1337008, 1569960, 1849520, 2150104, 2471712, 2814344, 3178000, 3401400, + 1277312, 1535848, 1815408, 2115992, 2437600, 2780232, 3143888, 3528568, 3934272, 4361000, 4808752, + 5277528, 5767328, 6278152, 6810000, 7029560, 0, 94728, 211504, 350328, 511200, 694120, 899088, 1126104, + 1402032, 1646280, 1939440, 2254648, 2591904, 2951208, 3332560, 3567000, 1339264, 1610376, 1903536, + 2218744, 2556000, 2915304, 3296656, 3700056, 4125504, 4573000, 5042544, 5534136, 6047776, 6583464, + 7141200, 7371800, 0, 99112, 221296, 366552, 534880, 726280, 940752, 1178296, 1467056, 1722600, 2029360, + 2359192, 2712096, 3088072, 3487120, 3732600, 1401216, 1684904, 1991664, 2321496, 2674400, 3050376, + 3449424, 3871544, 4316736, 4785000, 5276336, 5790744, 6328224, 6888776, 7472400, 7714040, 0, 103496, + 231088, 382776, 558560, 758440, 982416, 1230488, 1532080, 1798920, 2119280, 2463736, 2832288, 3224936, + 3641680, 3898200, 1463168, 1759432, 2079792, 2424248, 2792800, 3185448, 3602192, 4043032, 4507968, + 4997000, 5510128, 6047352, 6608672, 7194088, 7803600, 8056280, 0, 107880, 240880, 399000, 582240, 790600, + 1024080, 1282680, 1597104, 1875240, 2209200, 2568280, 2952480, 3361800, 3796240, 4063800, 1525120, + 1833960, 2167920, 2527000, 2911200, 3320520, 3754960, 4214520, 4699200, 5209000, 5743920, 6303960, + 6889120, 7499400, 8134800, 8398520, 0, 112264, 250672, 415224, 605920, 822760, 1065744, 1334872, 1662128, + 1951560, 2299120, 2672824, 3072672, 3498664, 3950800, 4229400, 1587072, 
1908488, 2256048, 2629752, + 3029600, 3455592, 3907728, 4386008, 4890432, 5421000, 5977712, 6560568, 7169568, 7804712, 8466000, + 8740760, 0, 116648, 260464, 431448, 629600, 854920, 1107408, 1387064, 1727152, 2027880, 2389040, 2777368, + 3192864, 3635528, 4105360, 4395000, 1649024, 1983016, 2344176, 2732504, 3148000, 3590664, 4060496, + 4557496, 5081664, 5633000, 6211504, 6817176, 7450016, 8110024, 8797200, 9083000, 0, 121032, 270256, + 447672, 653280, 887080, 1149072, 1439256, 1792176, 2104200, 2478960, 2881912, 3313056, 3772392, 4259920, + 4560600, 1710976, 2057544, 2432304, 2835256, 3266400, 3725736, 4213264, 4728984, 5272896, 5845000, + 6445296, 7073784, 7730464, 8415336, 9128400, 9425240, 0, 125416, 280048, 463896, 676960, 919240, 1190736, + 1491448, 1857200, 2180520, 2568880, 2986456, 3433248, 3909256, 4414480, 4726200, 1772928, 2132072, + 2520432, 2938008, 3384800, 3860808, 4366032, 4900472, 5464128, 6057000, 6679088, 7330392, 8010912, + 8720648, 9459600, 9767480, 0, 129800, 289840, 480120, 700640, 951400, 1232400, 1543640, 1922224, 2256840, + 2658800, 3091000, 3553440, 4046120, 4569040, 4891800, 1834880, 2206600, 2608560, 3040760, 3503200, + 3995880, 4518800, 5071960, 5655360, 6269000, 6912880, 7587000, 8291360, 9025960, 9790800, 10109720, 0, + 134184, 299632, 496344, 724320, 983560, 1274064, 1595832, 1987248, 2333160, 2748720, 3195544, 3673632, + 4182984, 4723600, 5057400, 1896832, 2281128, 2696688, 3143512, 3621600, 4130952, 4671568, 5243448, + 5846592, 6481000, 7146672, 7843608, 8571808, 9331272, 10122000, 10451960, 0, 138568, 309424, 512568, + 748000, 1015720, 1315728, 1648024, 2052272, 2409480, 2838640, 3300088, 3793824, 4319848, 4878160, 5223000, + 1958784, 2355656, 2784816, 3246264, 3740000, 4266024, 4824336, 5414936, 6037824, 6693000, 7380464, + 8100216, 8852256, 9636584, 10453200, 10794200 + ] + } + ] + } + ] + } +] diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 55b21283025c2..1c61518ddcdd2 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -1362,6 +1362,7 @@ "less.jsonc", "log.jsonc", "matmul.jsonc", + "matmulnbits.jsonc", "matmul-broadcast.jsonc", "mul.jsonc", "mul_int32.jsonc", diff --git a/onnxruntime/contrib_ops/js/js_contrib_kernels.cc b/onnxruntime/contrib_ops/js/js_contrib_kernels.cc index bd58dded026a6..25e7567a2e9fc 100644 --- a/onnxruntime/contrib_ops/js/js_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/js/js_contrib_kernels.cc @@ -8,13 +8,14 @@ namespace contrib { namespace js { class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, Attention); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, BiasAdd); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, BiasSplitGelu); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, FastGelu); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, FusedConv); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, Gelu); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, MatMulNBits); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, MultiHeadAttention); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, BiasSplitGelu); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, BiasAdd); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, SkipLayerNormalization); -class 
ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, FusedConv); template <> KernelCreateInfo BuildKernelCreateInfo() { @@ -25,14 +26,15 @@ KernelCreateInfo BuildKernelCreateInfo() { Status RegisterJsContribKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo}; + SkipLayerNormalization)>}; for (auto& function_table_entry : function_table) { KernelCreateInfo info = function_table_entry(); diff --git a/onnxruntime/contrib_ops/js/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/js/quantization/matmul_nbits.cc new file mode 100644 index 0000000000000..888db0fd161f2 --- /dev/null +++ b/onnxruntime/contrib_ops/js/quantization/matmul_nbits.cc @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "contrib_ops/js/quantization/matmul_nbits.h" +#include "core/providers/js/js_data_types.h" + +namespace onnxruntime { +namespace contrib { +namespace js { + +using onnxruntime::js::JsepSupportedFloatTypes; + +ONNX_OPERATOR_KERNEL_EX( + MatMulNBits, + kMSDomain, + 1, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T1", JsepSupportedFloatTypes()) + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), + MatMulNBits); + +} // namespace js +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/js/quantization/matmul_nbits.h b/onnxruntime/contrib_ops/js/quantization/matmul_nbits.h new file mode 100644 index 0000000000000..cca2c4757765b --- /dev/null +++ b/onnxruntime/contrib_ops/js/quantization/matmul_nbits.h @@ -0,0 +1,48 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/providers/js/js_kernel.h" + +namespace onnxruntime { +namespace contrib { +namespace js { + +using onnxruntime::js::JsKernel; + +class MatMulNBits final : public JsKernel { + public: + MatMulNBits(const OpKernelInfo& info) : JsKernel(info), + K_{narrow(info.GetAttr("K"))}, + N_{narrow(info.GetAttr("N"))}, + accuracy_level_{info.GetAttrOrDefault("accuracy_level", 0)}, + nbits_{narrow(info.GetAttr("bits"))}, + block_size_{narrow(info.GetAttr("block_size"))} { + ORT_ENFORCE(nbits_ == 4, + "Only 4b quantization is supported for MatMulNBits op, additional bits support is planned."); + ORT_ENFORCE(block_size_ >= 16 && !(block_size_ & (block_size_ - 1)), + "Block size must be a power of 2 and greater than or equal to 16."); + JSEP_INIT_KERNEL_ATTRIBUTE(MatMulNBits, ({ + "k" : $1, + "n" : $2, + "accuracyLevel" : $3, + "bits" : $4, + "blockSize" : $5 + }), + static_cast(K_), + static_cast(N_), + static_cast(accuracy_level_), + static_cast(nbits_), + static_cast(block_size_)); + } + + private: + const size_t K_; + const size_t N_; + const int64_t accuracy_level_; + const size_t nbits_; + const size_t block_size_; +}; + +} // namespace js +} // namespace contrib +} // namespace onnxruntime From b55260d076da309f3a4634eb5248a0eb541e8ca0 Mon Sep 17 00:00:00 2001 From: pengwa Date: Mon, 19 Feb 2024 10:21:19 +0800 Subject: [PATCH 011/279] Minor fix for cmake (#19552) ### Minor fix for cmake When build on Linux, get a warning saying " CMake Warning at CMakeLists.txt:1603 (message): MPI and NCCL disabled on Win build. " This message is not correct. So have such a fix to avoid any misunderstanding from users. ![image](https://github.com/microsoft/onnxruntime/assets/10530022/848c2d77-a538-4e31-8e0d-4b539233e515) ### Motivation and Context --- cmake/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index ff1c7a84f077f..c9be4aa65d0cc 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1600,7 +1600,7 @@ if (UNIX AND onnxruntime_USE_NCCL) else() set(onnxruntime_USE_NCCL OFF) set(onnxruntime_USE_MPI OFF) -message( WARNING "MPI and NCCL disabled on Win build." ) + message( WARNING "MPI and NCCL are disabled because build is on Windows or USE_NCCL is set to OFF." ) endif() if (onnxruntime_USE_MPI) From f3e3b531fe4c0d33d70928b101fb5d445e4174a8 Mon Sep 17 00:00:00 2001 From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com> Date: Tue, 20 Feb 2024 10:31:39 +0800 Subject: [PATCH 012/279] Update build directory clean up stage for python package pipeline (#19553) Fix to make clean up stage take effect. If the `SourceFolder ` is empty, the task deletes files from the root folder of the repository as though [$(Build.SourcesDirectory)](https://learn.microsoft.com/en-us/azure/devops/pipelines/build/variables) was specified. 
--- .../component-governance-component-detection-steps.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml b/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml index c2ef565a6e9ee..f1418e75bffa2 100644 --- a/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml @@ -5,10 +5,12 @@ parameters: default: 'succeeded' # could be 'ci_only', 'always', 'succeeded' steps: -- ${{ if eq(variables['System.TeamProject'], 'Lotus') }}: +- ${{ if eq(variables['System.TeamProject'], 'Lotus') }}: - task: DeleteFiles@1 inputs: - contents: $(Build.BinariesDirectory)/* + SourceFolder: '$(Build.BinariesDirectory)' + contents: | + **/* displayName: 'Clean up build directory' - task: ms.vss-governance-buildtask.governance-build-task-component-detection.ComponentGovernanceComponentDetection@0 From e832562d70685ffeaab7e3bfa20cd5e9aec916a3 Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Tue, 20 Feb 2024 09:06:03 +0100 Subject: [PATCH 013/279] Fix invalid usage of designated initializers. (#19497) ### Description I've replaces all ocurances of C++ designated initializers in the CUDA NHWC Tests by member initialization. ### Motivation and Context C++ designated initializers have been introduced in C++ 20. Yet GCC accepts designated initializers in C++17 which is the standard used to compile onnxruntime. Yet MSVC is standard conform and accepts this feature starting C++20 which leads to compile failures on Windows without this change. --- .../test/providers/cuda/nhwc/conv_test.cc | 23 +++++++--- .../cuda/nhwc/conv_transpose_test.cc | 40 +++++++++------- .../providers/cuda/nhwc/nhwc_cuda_helper.h | 6 ++- .../test/providers/cuda/nhwc/norm_test.cc | 7 ++- .../test/providers/cuda/nhwc/pool_test.cc | 46 ++++++++++--------- 5 files changed, 72 insertions(+), 50 deletions(-) diff --git a/onnxruntime/test/providers/cuda/nhwc/conv_test.cc b/onnxruntime/test/providers/cuda/nhwc/conv_test.cc index 13d4546d669e3..b6a760f7041ad 100644 --- a/onnxruntime/test/providers/cuda/nhwc/conv_test.cc +++ b/onnxruntime/test/providers/cuda/nhwc/conv_test.cc @@ -9,8 +9,8 @@ namespace test { template struct ConvOp { - const std::vector input_dims; - const std::vector kernel_shape; + std::vector input_dims; + std::vector kernel_shape; int64_t channels; int64_t group = 1; bool bias = false; @@ -52,20 +52,31 @@ struct ConvOp { }; TYPED_TEST(CudaNhwcTypedTest, ConvNhwcBias) { - auto op = ConvOp{.input_dims = {1, 16, 64, 64}, .kernel_shape = {3, 3}, .channels = 16, .bias = true}; + auto op = ConvOp{}; + op.input_dims = {1, 16, 64, 64}; + op.kernel_shape = {3, 3}; + op.channels = 16; + op.bias = true; MAKE_PROVIDERS_EPS_TYPE(TypeParam) } TYPED_TEST(CudaNhwcTypedTest, ConvNhwcGroupNoBias) { - auto op = ConvOp{.input_dims = {1, 16, 64, 64}, .kernel_shape = {3, 3}, .channels = 16, .group = 4}; + auto op = ConvOp{}; + op.input_dims = {1, 16, 64, 64}; + op.kernel_shape = {3, 3}; + op.channels = 16; + op.group = 4; MAKE_PROVIDERS_EPS_TYPE(TypeParam) } TYPED_TEST(CudaNhwcTypedTest, ConvNhwcPadding) { - auto op = - ConvOp{.input_dims = {2, 4, 64, 64}, .kernel_shape = {3, 3}, .channels = 4, .padding = {4, 4, 4, 4}}; + auto op = ConvOp{}; + op.input_dims = {2, 4, 64, 64}; + op.kernel_shape = {3, 3}; + op.channels = 4; + op.padding = {4, 4, 4, 
4}; MAKE_PROVIDERS_EPS_TYPE(TypeParam) } diff --git a/onnxruntime/test/providers/cuda/nhwc/conv_transpose_test.cc b/onnxruntime/test/providers/cuda/nhwc/conv_transpose_test.cc index 6514feadf0ff7..786b2cb4cedc4 100644 --- a/onnxruntime/test/providers/cuda/nhwc/conv_transpose_test.cc +++ b/onnxruntime/test/providers/cuda/nhwc/conv_transpose_test.cc @@ -9,8 +9,8 @@ namespace test { template struct ConvTransposeOp { - const std::vector input_dims; - const std::vector kernel_shape; + std::vector input_dims; + std::vector kernel_shape; int64_t channels; int64_t group = 1; bool bias = false; @@ -60,15 +60,21 @@ struct ConvTransposeOp { }; TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcGroupNoBias) { - auto op = - ConvTransposeOp{.input_dims = {8, 8, 32, 32}, .kernel_shape = {3, 3}, .channels = 16, .group = 4}; + auto op = ConvTransposeOp{}; + op.input_dims = {8, 8, 32, 32}; + op.kernel_shape = {3, 3}; + op.channels = 16; + op.group = 4; MAKE_PROVIDERS_EPS_TYPE(TypeParam) } TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcBias) { - auto op = - ConvTransposeOp{.input_dims = {1, 8, 80, 80}, .kernel_shape = {5, 5}, .channels = 16, .bias = true}; + auto op = ConvTransposeOp{}; + op.input_dims = {1, 8, 80, 80}; + op.kernel_shape = {5, 5}; + op.channels = 16; + op.bias = true; if (HasCudaEnvironment(800)) { MAKE_PROVIDERS_EPS(1e-2) @@ -78,21 +84,23 @@ TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcBias) { } TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcPad) { - auto op = ConvTransposeOp{.input_dims = {1, 16, 8, 8}, - .kernel_shape = {3, 3}, - .channels = 32, - .padding = {2, 2, 2, 2}, - .output_padding = {}}; + auto op = ConvTransposeOp{}; + op.input_dims = {1, 16, 8, 8}; + op.kernel_shape = {3, 3}; + op.channels = 32; + op.padding = {2, 2, 2, 2}; + op.output_padding = {}; MAKE_PROVIDERS_EPS_TYPE(TypeParam) } TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcOutPad) { - auto op = ConvTransposeOp{.input_dims = {1, 32, 8, 8}, - .kernel_shape = {3, 3}, - .channels = 32, - .strides = {2, 2}, - .output_padding = {1, 1, 1, 1}}; + auto op = ConvTransposeOp{}; + op.input_dims = {1, 32, 8, 8}; + op.kernel_shape = {3, 3}; + op.channels = 32; + op.strides = {2, 2}; + op.output_padding = {1, 1, 1, 1}; MAKE_PROVIDERS_EPS_TYPE(TypeParam) } diff --git a/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h b/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h index 2c942bb790096..82b6a286409cd 100644 --- a/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h +++ b/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h @@ -16,11 +16,13 @@ #define MAKE_PROVIDERS_EPS(eps) \ std::vector> execution_providers; \ - OrtCUDAProviderOptionsV2 nhwc = {.prefer_nhwc = true}; \ + OrtCUDAProviderOptionsV2 nhwc{}; \ + nhwc.prefer_nhwc = true; \ execution_providers.push_back(CudaExecutionProviderWithOptions(&nhwc)); \ \ double error_tolerance = eps; \ - OrtCUDAProviderOptionsV2 nchw = {.prefer_nhwc = false}; \ + OrtCUDAProviderOptionsV2 nchw{}; \ + nchw.prefer_nhwc = false; \ auto source_ep = CudaExecutionProviderWithOptions(&nchw); \ auto test = op.get_test(); \ test->CompareEPs(std::move(source_ep), execution_providers, error_tolerance); diff --git a/onnxruntime/test/providers/cuda/nhwc/norm_test.cc b/onnxruntime/test/providers/cuda/nhwc/norm_test.cc index 52da8ba557c2d..40f69e3bd5b4f 100644 --- a/onnxruntime/test/providers/cuda/nhwc/norm_test.cc +++ b/onnxruntime/test/providers/cuda/nhwc/norm_test.cc @@ -9,7 +9,7 @@ namespace test { template struct BatchNormOp { - const std::vector input_dims; + std::vector input_dims; 
std::unique_ptr get_test() { // create rand inputs @@ -40,9 +40,8 @@ struct BatchNormOp { }; TYPED_TEST(CudaNhwcTypedTest, BatchNormNhwc) { - auto op = BatchNormOp{ - .input_dims = {4, 16, 64, 64}, - }; + auto op = BatchNormOp{}; + op.input_dims = {4, 16, 64, 64}; MAKE_PROVIDERS() } diff --git a/onnxruntime/test/providers/cuda/nhwc/pool_test.cc b/onnxruntime/test/providers/cuda/nhwc/pool_test.cc index e0d59901da80c..426170b9588f1 100644 --- a/onnxruntime/test/providers/cuda/nhwc/pool_test.cc +++ b/onnxruntime/test/providers/cuda/nhwc/pool_test.cc @@ -9,9 +9,9 @@ namespace test { template struct PoolOp { - const std::string pooling_type; - const std::vector input_dims; - const std::vector kernel_shape; + std::string pooling_type; + std::vector input_dims; + std::vector kernel_shape; int64_t channels; int64_t group = 1; std::vector strides = {1, 1}; @@ -41,22 +41,21 @@ struct PoolOp { }; TYPED_TEST(CudaNhwcTypedTest, AveragePoolNhwc) { - auto op = PoolOp{ - .pooling_type = "AveragePool", - .input_dims = {1, 16, 64, 64}, - .kernel_shape = {3, 3}, - .channels = 16, - }; + auto op = PoolOp{}; + op.pooling_type = "AveragePool"; + op.input_dims = {1, 16, 64, 64}; + op.kernel_shape = {3, 3}; + op.channels = 16; + MAKE_PROVIDERS() } TYPED_TEST(CudaNhwcTypedTest, MaxPoolNhwc) { - auto op = PoolOp{ - .pooling_type = "MaxPool", - .input_dims = {1, 16, 64, 64}, - .kernel_shape = {3, 3}, - .channels = 16, - }; + auto op = PoolOp{}; + op.pooling_type = "MaxPool"; + op.input_dims = {1, 16, 64, 64}; + op.kernel_shape = {3, 3}; + op.channels = 16; MAKE_PROVIDERS() } @@ -72,21 +71,24 @@ TYPED_TEST(CudaNhwcTypedTest, GlobalMaxPoolNhwc) { test->AddOutput("Y", output_dims, output_data); std::vector> execution_providers; - OrtCUDAProviderOptionsV2 nhwc = {.prefer_nhwc = true}; + OrtCUDAProviderOptionsV2 nhwc{}; + nhwc.prefer_nhwc = true; execution_providers.push_back(CudaExecutionProviderWithOptions(&nhwc)); double error_tolerance = 1e-3; - OrtCUDAProviderOptionsV2 nchw = {.prefer_nhwc = false}; + OrtCUDAProviderOptionsV2 nchw{}; + nchw.prefer_nhwc = false; auto source_ep = CudaExecutionProviderWithOptions(&nchw); test->CompareEPs(std::move(source_ep), execution_providers, error_tolerance); } TYPED_TEST(CudaNhwcTypedTest, AveragePoolNhwcPad) { - auto op = PoolOp{.pooling_type = "AveragePool", - .input_dims = {1, 16, 64, 64}, - .kernel_shape = {3, 3}, - .channels = 16, - .padding = {2, 2, 2, 2}}; + auto op = PoolOp{}; + op.pooling_type = "AveragePool"; + op.input_dims = {1, 16, 64, 64}; + op.kernel_shape = {3, 3}; + op.channels = 16; + op.padding = {2, 2, 2, 2}; MAKE_PROVIDERS() } From 7efb0dbe12cf8736d97dcc3b8f41eb96c5c34719 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= Date: Tue, 20 Feb 2024 17:22:44 +0100 Subject: [PATCH 014/279] add option DefaultTensorType to specify the default tensor type to quantize (#19455) ### Description The current quantization tool relies on shape inference to provide the type of every intermediate tensor, then the tool knows which type it must dequantize into (float32, float16). However, this information is not available if shape inference fails. That happens every time the model include an operator from a custom domain such as com.microsoft. This PR introduces an extra option `DefaultTensorType` as a fall back when the quantizer cannot find the type it needs. ### Motivation and Context This fixes issue #19409. 
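As a rough usage sketch (the model paths here are placeholders; the new option is simply forwarded through `extra_options`, exactly as the `quantize_helper.py` change below does):

```python
import onnx
from onnxruntime.quantization import quantize_dynamic

# Placeholder paths. DefaultTensorType is only consulted when shape inference
# cannot determine a tensor's element type (e.g. around com.microsoft operators).
quantize_dynamic(
    "model.onnx",
    "model.quant.onnx",
    extra_options={"DefaultTensorType": onnx.TensorProto.FLOAT},
)
```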
--- .../tools/quantization/onnx_quantizer.py | 25 ++++- .../tools/transformers/quantize_helper.py | 3 +- .../test_quantizer_shape_inference.py | 92 +++++++++++++++++++ 3 files changed, 115 insertions(+), 5 deletions(-) create mode 100644 onnxruntime/test/python/quantization/test_quantizer_shape_inference.py diff --git a/onnxruntime/python/tools/quantization/onnx_quantizer.py b/onnxruntime/python/tools/quantization/onnx_quantizer.py index ecfbaa569ca0a..9450426f12444 100644 --- a/onnxruntime/python/tools/quantization/onnx_quantizer.py +++ b/onnxruntime/python/tools/quantization/onnx_quantizer.py @@ -385,7 +385,7 @@ def add_new_nodes(self, nodes): def quantize_model(self): if self.has_QDQ_nodes(): logging.warning( - "Please check if the model is already quantized." + "Please check if the model is already quantized. " "Note you don't need to quantize a QAT model. OnnxRuntime support to run QAT model directly." ) @@ -442,6 +442,23 @@ def is_valid_quantize_weight(self, weight_name): return False return self.parent.is_valid_quantize_weight(weight_name) + def _get_default_tensor_type(self, tensor_name): + if "DefaultTensorType" in self.extra_options: + logging.info( + "get_tensor_type returns DefaultTensorType for tensor name %r, use %d", + tensor_name, + self.extra_options["DefaultTensorType"], + ) + return self.extra_options["DefaultTensorType"] + raise RuntimeError( + f"Unable to find data type for weight_name={tensor_name!r}. " + f"shape_inference failed to return a type probably this node is " + f"from a different domain or using an input produced by such an operator. " + f"This may happen if you quantize a model already quantized. " + f"You may use extra_options `DefaultTensorType` to indicate " + f"the default weight type, usually `onnx.TensorProto.FLOAT`." 
+ ) + def get_tensor_type(self, tensor_name, mandatory=False): weight = find_by_name(tensor_name, self.model.initializer()) if weight is not None: @@ -450,11 +467,11 @@ def get_tensor_type(self, tensor_name, mandatory=False): vi = self.value_infos[tensor_name] if vi.type.HasField("tensor_type"): if mandatory and vi.type.tensor_type.elem_type == 0: - raise RuntimeError(f"Unable to find data type for weight_name={tensor_name!r}") + return self._get_default_tensor_type(tensor_name) return vi.type.tensor_type.elem_type if (not self.enable_subgraph_quantization) or (self.parent is None): if mandatory: - raise RuntimeError(f"Unable to find data type for weight_name={tensor_name!r}") + return self._get_default_tensor_type(tensor_name) return None otype = self.parent.is_valid_quantize_weight(tensor_name) if otype is not None: @@ -464,7 +481,7 @@ def get_tensor_type(self, tensor_name, mandatory=False): if res is not None: return res if mandatory: - raise RuntimeError(f"Unable to find data type for weight_name={tensor_name!r}") + return self._get_default_tensor_type(tensor_name) return None def is_float_tensor(self, tensor_name): diff --git a/onnxruntime/python/tools/transformers/quantize_helper.py b/onnxruntime/python/tools/transformers/quantize_helper.py index a449e881ad361..6a25196dbc24c 100644 --- a/onnxruntime/python/tools/transformers/quantize_helper.py +++ b/onnxruntime/python/tools/transformers/quantize_helper.py @@ -7,7 +7,7 @@ import logging import os -import onnx # noqa: F401 +import onnx import torch from transformers.modeling_utils import Conv1D @@ -69,6 +69,7 @@ def quantize_onnx_model(onnx_model_path, quantized_model_path, use_external_data onnx_model_path, quantized_model_path, use_external_data_format=use_external_data_format, + extra_options={"DefaultTensorType": onnx.TensorProto.FLOAT}, ) logger.info(f"quantized model saved to:{quantized_model_path}") # TODO: inlcude external data in total model size. diff --git a/onnxruntime/test/python/quantization/test_quantizer_shape_inference.py b/onnxruntime/test/python/quantization/test_quantizer_shape_inference.py new file mode 100644 index 0000000000000..2b5d1f36070e5 --- /dev/null +++ b/onnxruntime/test/python/quantization/test_quantizer_shape_inference.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
+# -------------------------------------------------------------------------- +import unittest + +import numpy as np +import onnx +import onnx.helper as oh +import onnx.numpy_helper as onh + +from onnxruntime.quantization.onnx_quantizer import ONNXQuantizer +from onnxruntime.quantization.quant_utils import QuantizationMode, QuantType + + +class TestQuantizerShapeInference(unittest.TestCase): + def test_com_microsoft(self): + model = oh.make_model( + oh.make_graph( + [ + oh.make_node("MatMul", ["X", "W1"], ["T1"]), + oh.make_node("FusedMatMul", ["T1", "W2"], ["T2"], domain="com.microsoft"), + oh.make_node("MatMul", ["T2", "W3"], ["T3"]), + oh.make_node("MatMul", ["T3", "W4"], ["Y"]), + ], + "name", + [oh.make_tensor_value_info("X", onnx.TensorProto.FLOAT, [1, 4])], + [oh.make_tensor_value_info("Y", onnx.TensorProto.FLOAT, [1, 4])], + [ + onh.from_array(np.random.randn(4, 4).astype(np.float32), "W1"), + onh.from_array(np.random.randn(4, 4).astype(np.float32), "W2"), + onh.from_array(np.random.randn(4, 4).astype(np.float32), "W3"), + onh.from_array(np.random.randn(4, 4).astype(np.float32), "W4"), + ], + ), + opset_imports=[oh.make_opsetid("", 18), oh.make_opsetid("com.microsoft", 1)], + ) + model_shaped = onnx.shape_inference.infer_shapes(model) + shaped_results = set(t.name for t in model_shaped.graph.value_info) + # every result after T1 depends on T2 coming from a node com.microsoft, + # shape_inference cannot go beyond this point + self.assertEqual(shaped_results, {"T1"}) + + # first try: checks it raises an exception + quantizer = ONNXQuantizer( + model, + False, # per_channel + False, # reduce_range + QuantizationMode.IntegerOps, # mode + False, # static + QuantType.QInt8, # weight_type, + QuantType.QUInt8, # dynamic activation only supports uint8 + None, + [], # nodes_to_quantize, + [], # nodes_to_exclude + ["MatMul"], # op_types_to_quantize, + {"MatMulConstBOnly": True}, # extra_options, + # {'DefaultTensorType': 1, } + ) + + with self.assertRaises(RuntimeError) as e: + quantizer.quantize_model() + self.assertIn("Unable to find data type for weight_name=", str(e)) + + # second try: checks it works + quantizer = ONNXQuantizer( + model, + False, # per_channel + False, # reduce_range + QuantizationMode.IntegerOps, # mode + False, # static + QuantType.QInt8, # weight_type, + QuantType.QUInt8, # dynamic activation only supports uint8 + None, + [], # nodes_to_quantize, + [], # nodes_to_exclude + ["MatMul"], # op_types_to_quantize, + { + "MatMulConstBOnly": True, + "DefaultTensorType": 1, + }, + ) + + model = quantizer.quantize_model() + ops = {n.op_type for n in model.graph.node} + self.assertEqual(ops, {"Cast", "FusedMatMul", "MatMulInteger", "DynamicQuantizeLinear", "Mul"}) + + +if __name__ == "__main__": + unittest.main(verbosity=2) From 1b48054e1b7991ccef664fbedd659ec95d0e7ca7 Mon Sep 17 00:00:00 2001 From: Jiajie Hu Date: Wed, 21 Feb 2024 01:24:34 +0800 Subject: [PATCH 015/279] [js/webgpu] Create Split indices helpers by rank, not by shape (#19554) ### Description This is required to make shape uniforms really work. ### Motivation and Context The bug was unveiled in a model with multiple Split nodes. The later nodes would try to reuse a previous pipeline cache, while the old shapes were hardcoded as constants in cache. 
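As a language-agnostic sketch of the pitfall (hypothetical code, not the actual jsep implementation): the generated program should be specialized on rank only, with the concrete dimensions supplied at run time as uniforms, so that several Split nodes with different shapes can safely share one cached pipeline.

```python
# Hypothetical illustration, not the jsep code: key and generate the program by
# rank only, so nodes with different concrete shapes can reuse the cache entry.
program_cache: dict = {}

def split_program(input_rank: int) -> str:
    key = ("Split", input_rank)
    if key not in program_cache:
        # Concrete dims are not baked into the source; they arrive as uniforms.
        program_cache[key] = f"// program specialized for rank {input_rank}; dims from uniforms"
    return program_cache[key]
```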
--- js/web/lib/wasm/jsep/webgpu/ops/split.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/split.ts b/js/web/lib/wasm/jsep/webgpu/ops/split.ts index 14d6f37927590..a09ac78b17006 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/split.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/split.ts @@ -68,7 +68,7 @@ const createSplitProgramInfo = (inputs: readonly TensorView[], attributes: Split const dataType = inputs[0].dataType; const axis = ShapeUtil.normalizeAxis(attributes.axis, inputShape.length); const outputs = new Array(attributes.numOutputs); - const input = inputVariable('input', dataType, inputShape); + const input = inputVariable('input', dataType, inputShape.length); const sizeInSplitAxis = new Array(attributes.numOutputs); const outputsTensorInfo: TensorInfo[] = []; const outputShapes: number[][] = []; @@ -80,7 +80,7 @@ const createSplitProgramInfo = (inputs: readonly TensorView[], attributes: Split const outputShape = inputShape.slice(); outputShape[attributes.axis] = attributes.splitSizes[i]; outputShapes.push(outputShape); - outputs[i] = outputVariable(`output${i}`, dataType, outputShape); + outputs[i] = outputVariable(`output${i}`, dataType, outputShape.length); outputsTensorInfo.push({dims: outputShapes[i], dataType: inputs[0].dataType}); } programUniforms.push( From 3c49aacd5667b320a4e02626a176098f7423d7c0 Mon Sep 17 00:00:00 2001 From: Sheil Kumar Date: Tue, 20 Feb 2024 13:13:40 -0800 Subject: [PATCH 016/279] Disable __cpuid check on arm64 builds as intrinsic is not available (#19574) Disable __cpuid check on arm64 builds as intrinsic is not available Motivation Breaking the arm64 build. Co-authored-by: Sheil Kumar --- winml/lib/Api/HardwareCoreEnumerator.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/winml/lib/Api/HardwareCoreEnumerator.cpp b/winml/lib/Api/HardwareCoreEnumerator.cpp index fa069c7fb66a7..b6b44690f4f6c 100644 --- a/winml/lib/Api/HardwareCoreEnumerator.cpp +++ b/winml/lib/Api/HardwareCoreEnumerator.cpp @@ -84,6 +84,7 @@ uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() { // # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores. auto cores = GetNumberOPhysicalAndEngineeringCores(); +#if !defined(_M_ARM64) && !defined(__aarch64__) const int kVendorID_Intel[3] = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI" int regs_leaf0[4]; int regs_leaf7[4]; @@ -100,6 +101,7 @@ uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() { // On Intel Hybrid processors, numSocCores == cores.Num2CacheCores return cores.PhysicalCores - cores.Num2CacheCores; } +#endif return cores.PhysicalCores; } From ec9c8cbdc9686ccda6553674d6aab61cfd245cf0 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Wed, 21 Feb 2024 07:40:35 +1000 Subject: [PATCH 017/279] Use xcode parallel build flags to speed up iOS CI that is timing out (#19570) ### Description Provide specific xcodebuild flags instead of depending on cmake to do the right thing. This built in just over an hour with a ccache miss. Previous CIs with a ccache miss were timing out after 150 minutes. 
### Motivation and Context --- tools/ci_build/build.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 244bebd81474d..5b715bb29e5a1 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1631,9 +1631,11 @@ def generate_build_tree( [ *temp_cmake_args, f"-DCMAKE_BUILD_TYPE={config}", - f"-DCMAKE_PREFIX_PATH={build_dir}/{config}/installed" - if preinstalled_dir.exists() and not (args.arm64 or args.arm64ec or args.arm) - else "", + ( + f"-DCMAKE_PREFIX_PATH={build_dir}/{config}/installed" + if preinstalled_dir.exists() and not (args.arm64 or args.arm64ec or args.arm) + else "" + ), ], cwd=config_build_dir, cuda_home=cuda_home, @@ -1667,8 +1669,11 @@ def build_targets(args, cmake_path, build_dir, configs, num_parallel_jobs, targe f"/p:CL_MPCount={num_parallel_jobs}", ] elif args.cmake_generator == "Xcode": - # CMake will generate correct build tool args for Xcode - cmd_args += ["--parallel", str(num_parallel_jobs)] + build_tool_args += [ + "-parallelizeTargets", + "-jobs", + str(num_parallel_jobs), + ] else: build_tool_args += [f"-j{num_parallel_jobs}"] From 7a5860e4909387448cb51351d3af50933238ba10 Mon Sep 17 00:00:00 2001 From: Jake Mathern Date: Tue, 20 Feb 2024 13:41:40 -0800 Subject: [PATCH 018/279] Fix cmake function duplicate lib (#19547) ### Description Fixes cmake function definition in winml.cmake to copy link flags. ### Motivation and Context XFGCheck errors in WindowsAI because this function does not transfer linker flags --- cmake/winml.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/winml.cmake b/cmake/winml.cmake index 268ee3960e75a..57cecd3e66adb 100644 --- a/cmake/winml.cmake +++ b/cmake/winml.cmake @@ -827,6 +827,7 @@ if (winml_is_inbox) get_target_property(compile_options ${target} COMPILE_OPTIONS) get_target_property(include_directories ${target} INCLUDE_DIRECTORIES) get_target_property(link_libraries ${target} LINK_LIBRARIES) + get_target_property(link_flags ${target} LINK_FLAGS) get_target_property(link_options ${target} LINK_OPTIONS) add_library(${new_target} SHARED ${sources}) @@ -835,6 +836,7 @@ if (winml_is_inbox) target_compile_options(${new_target} PRIVATE ${compile_options}) target_include_directories(${new_target} PRIVATE ${include_directories}) target_link_libraries(${new_target} PRIVATE ${link_libraries}) + set_property(TARGET ${new_target} PROPERTY LINK_FLAGS "${link_flags}") target_link_options(${new_target} PRIVATE ${link_options}) endfunction() From 97ff17c2cbb6ee6f27c052e9c4302c70a41af485 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 20 Feb 2024 17:02:11 -0800 Subject: [PATCH 019/279] update script of run CI for external PRs to add "Big Models" (#19576) ### Description update script of run CI for external PRs to add "Big Models" --- tools/python/run_CIs_for_external_pr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/python/run_CIs_for_external_pr.py b/tools/python/run_CIs_for_external_pr.py index 7a77839c4a4e7..df4e70b1e51fe 100644 --- a/tools/python/run_CIs_for_external_pr.py +++ b/tools/python/run_CIs_for_external_pr.py @@ -93,6 +93,8 @@ def main(): # checks "onnxruntime-python-checks-ci-pipeline", "onnxruntime-binary-size-checks-ci-pipeline", + # big models + "Big Models", # not currently required, but running ensures we're hitting all mobile platforms "Android CI Pipeline", "iOS CI Pipeline", From 3fe2c137ee5923ee369062453d528fe0e33bf4bc Mon Sep 17 00:00:00 2001 From: 
Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 20 Feb 2024 17:23:01 -0800 Subject: [PATCH 020/279] [js] small fix to workaround formatter (#19400) ### Description Rename shader variable names to snake_case naming and also to avoid formatter behaving inconsistently in win/linux. --- js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts index 3f73d9cb7c5bc..d5f97213e49ce 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts @@ -85,28 +85,28 @@ const createLayerNormProgramInfo = ${shaderHelper.mainStart()} ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.norm_count')} let offset = global_idx * uniforms.norm_size_vectorized; - var meanVector = ${fillVector('f32', components)}; - var meanSquareVector = ${fillVector('f32', components)}; + var mean_vector = ${fillVector('f32', components)}; + var mean_square_vector = ${fillVector('f32', components)}; for (var h: u32 = 0u; h < uniforms.norm_size_vectorized; h++) { let value = ${castToF32(dataType, components, 'x[h + offset]')}; - meanVector += value; - meanSquareVector += value * value; + mean_vector += value; + mean_square_vector += value * value; } - let mean = ${sumVector('meanVector', components)} / uniforms.norm_size; - let invStdDev = - inverseSqrt(${sumVector('meanSquareVector', components)} / uniforms.norm_size - mean * mean + uniforms.epsilon); + let mean = ${sumVector('mean_vector', components)} / uniforms.norm_size; + let inv_std_dev = inverseSqrt(${ + sumVector('mean_square_vector', components)} / uniforms.norm_size - mean * mean + uniforms.epsilon); for (var j: u32 = 0; j < uniforms.norm_size_vectorized; j++) { let f32input = ${castToF32(dataType, components, 'x[j + offset]')}; let f32scale = ${castToF32(dataType, components, 'scale[j]')}; - output[j + offset] = ${variables[0].type.value}((f32input - mean) * invStdDev * f32scale + output[j + offset] = ${variables[0].type.value}((f32input - mean) * inv_std_dev * f32scale ${bias ? `+ ${castToF32(dataType, components, 'bias[j]')}` : ''} ); } ${hasMeanDataOutput ? 'mean_data_output[global_idx] = mean' : ''}; - ${hasInvStdOutput ? 'inv_std_output[global_idx] = invStdDev' : ''}; + ${hasInvStdOutput ? 'inv_std_output[global_idx] = inv_std_dev' : ''}; }`; }; const outputs = [{dims: outputShape, dataType: inputs[0].dataType}]; From 70567a4b3a8bc74fb0f1a9ed9ea5a5be6b99b378 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 20 Feb 2024 17:33:21 -0800 Subject: [PATCH 021/279] [js/web] use ApiTensor insteadof onnxjs Tensor in TensorResultValidator (#19358) ### Description use ApiTensor insteadof onnxjs Tensor in TensorResultValidator. Make test runner less depend on onnxjs classes. --- js/web/test/test-runner.ts | 26 +++++++------------ .../unittests/backends/webgl/test-conv-new.ts | 4 ++- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index b01d474788f25..ecc7d4b4a09a5 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -39,10 +39,6 @@ const ONNXRUNTIME_THRESHOLD_RELATIVE_ERROR = 1.00001; */ const now = (typeof performance !== 'undefined' && performance.now) ? 
() => performance.now() : Date.now; -function toInternalTensor(tensor: ort.Tensor): Tensor { - return new Tensor( - tensor.dims, tensor.type as Tensor.DataType, undefined, undefined, tensor.data as Tensor.NumberType); -} function fromInternalTensor(tensor: Tensor): ort.Tensor { return new ort.Tensor(tensor.type, tensor.data as ort.Tensor.DataType, tensor.dims); } @@ -330,6 +326,10 @@ export class TensorResultValidator { } checkTensorResult(actual: Tensor[], expected: Tensor[]): void { + this.checkApiTensorResult(actual.map(fromInternalTensor), expected.map(fromInternalTensor)); + } + + checkApiTensorResult(actual: ort.Tensor[], expected: ort.Tensor[]): void { // check output size expect(actual.length, 'size of output tensors').to.equal(expected.length); @@ -347,10 +347,6 @@ export class TensorResultValidator { } } - checkApiTensorResult(actual: ort.Tensor[], expected: ort.Tensor[]): void { - this.checkTensorResult(actual.map(toInternalTensor), expected.map(toInternalTensor)); - } - checkNamedTensorResult(actual: Record, expected: Test.NamedTensor[]): void { // check output size expect(Object.getOwnPropertyNames(actual).length, 'size of output tensors').to.equal(expected.length); @@ -364,7 +360,7 @@ export class TensorResultValidator { } // This function check whether 2 tensors should be considered as 'match' or not - areEqual(actual: Tensor, expected: Tensor): boolean { + areEqual(actual: ort.Tensor, expected: ort.Tensor): boolean { if (!actual || !expected) { return false; } @@ -392,13 +388,13 @@ export class TensorResultValidator { switch (actualType) { case 'string': - return this.strictEqual(actual.stringData, expected.stringData); + return this.strictEqual(actual.data, expected.data); case 'float32': case 'float64': return this.floatEqual( - actual.numberData as number[] | Float32Array | Float64Array, - expected.numberData as number[] | Float32Array | Float64Array); + actual.data as number[] | Float32Array | Float64Array, + expected.data as number[] | Float32Array | Float64Array); case 'uint8': case 'int8': @@ -409,10 +405,8 @@ export class TensorResultValidator { case 'int64': case 'bool': return TensorResultValidator.integerEqual( - actual.numberData as number[] | Uint8Array | Int8Array | Uint16Array | Int16Array | Uint32Array | - Int32Array, - expected.numberData as number[] | Uint8Array | Int8Array | Uint16Array | Int16Array | Uint32Array | - Int32Array); + actual.data as number[] | Uint8Array | Int8Array | Uint16Array | Int16Array | Uint32Array | Int32Array, + expected.data as number[] | Uint8Array | Int8Array | Uint16Array | Int16Array | Uint32Array | Int32Array); default: throw new Error('type not implemented or not supported'); diff --git a/js/web/test/unittests/backends/webgl/test-conv-new.ts b/js/web/test/unittests/backends/webgl/test-conv-new.ts index 8c186b9b36451..014fc57f21558 100644 --- a/js/web/test/unittests/backends/webgl/test-conv-new.ts +++ b/js/web/test/unittests/backends/webgl/test-conv-new.ts @@ -893,7 +893,9 @@ describe('New Conv tests', () => { const expected = cpuConv( inputTensor, kernelTensor, biasTensor, testData.autoPad, testData.dilations, testData.pads, testData.strides); - if (!validator.areEqual(actual, expected)) { + try { + validator.checkTensorResult([actual], [expected]); + } catch { console.log(actual.dims, `[${actual.numberData.slice(0, 20).join(',')},...]`); console.log(expected.dims, `[${expected.numberData.slice(0, 20).join(',')},...]`); throw new Error('Expected and Actual did not match'); From 6e04e36e3faf2d8115c0962c85b86a6a8b48ac5b Mon 
Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 20 Feb 2024 17:33:37 -0800 Subject: [PATCH 022/279] [js/common] upgrade tsc in common from 4.9.5 to 5.2.2 (#19317) ### Description upgrade tsc in common from 4.9.5 to 5.2.2 --- js/common/package-lock.json | 106 +++++++++++++++++------------------ js/common/package.json | 4 +- js/common/test/tsconfig.json | 2 +- 3 files changed, 56 insertions(+), 56 deletions(-) diff --git a/js/common/package-lock.json b/js/common/package-lock.json index a5ada877b916a..3988ac80707e0 100644 --- a/js/common/package-lock.json +++ b/js/common/package-lock.json @@ -9,13 +9,13 @@ "version": "1.18.0", "license": "MIT", "devDependencies": { - "typedoc": "^0.23.22" + "typedoc": "^0.25.7" } }, "node_modules/ansi-sequence-parser": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/ansi-sequence-parser/-/ansi-sequence-parser-1.1.0.tgz", - "integrity": "sha512-lEm8mt52to2fT8GhciPCGeCXACSz2UwIN4X2e2LJSnZ5uAbn2/dsYdOmUXq0AtWS5cpAupysIneExOgH0Vd2TQ==", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/ansi-sequence-parser/-/ansi-sequence-parser-1.1.1.tgz", + "integrity": "sha512-vJXt3yiaUL4UU546s3rPXlsry/RnM730G1+HkpKE012AN0sx1eOrxSu95oKDIonskeLTijMgqWZ3uDEe3NFvyg==", "dev": true }, "node_modules/balanced-match": { @@ -34,9 +34,9 @@ } }, "node_modules/jsonc-parser": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.2.0.tgz", - "integrity": "sha512-gfFQZrcTc8CnKXp6Y4/CBT3fTc0OVuDofpre4aEeEpSBPV5X5v4+Vmx+8snU7RLPrNHPKSgLxGo9YuQzz20o+w==", + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.2.1.tgz", + "integrity": "sha512-AilxAyFOAcK5wA1+LeaySVBrHsGQvUFCDWXKpZjzaL0PqW+xfBOttn8GNtWKFWqneyMZj41MWF9Kl6iPWLwgOA==", "dev": true }, "node_modules/lunr": { @@ -46,9 +46,9 @@ "dev": true }, "node_modules/marked": { - "version": "4.2.12", - "resolved": "https://registry.npmjs.org/marked/-/marked-4.2.12.tgz", - "integrity": "sha512-yr8hSKa3Fv4D3jdZmtMMPghgVt6TWbk86WQaWhDloQjRSQhMMYCAro7jP7VDJrjjdV8pxVxMssXS8B8Y5DZ5aw==", + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/marked/-/marked-4.3.0.tgz", + "integrity": "sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A==", "dev": true, "bin": { "marked": "bin/marked.js" @@ -58,24 +58,24 @@ } }, "node_modules/minimatch": { - "version": "7.4.2", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-7.4.2.tgz", - "integrity": "sha512-xy4q7wou3vUoC9k1xGTXc+awNdGaGVHtFUaey8tiX4H1QRc04DZ/rmDFwNm2EBsuYEhAZ6SgMmYf3InGY6OauA==", + "version": "9.0.3", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.3.tgz", + "integrity": "sha512-RHiac9mvaRw0x3AYRgDC1CxAP7HTcNrrECeA8YYJeWnpo+2Q5CegtZjaotWTWxDG3UeGA1coE05iH1mPjT/2mg==", "dev": true, "dependencies": { "brace-expansion": "^2.0.1" }, "engines": { - "node": ">=10" + "node": ">=16 || 14 >=14.17" }, "funding": { "url": "https://github.com/sponsors/isaacs" } }, "node_modules/shiki": { - "version": "0.14.1", - "resolved": "https://registry.npmjs.org/shiki/-/shiki-0.14.1.tgz", - "integrity": "sha512-+Jz4nBkCBe0mEDqo1eKRcCdjRtrCjozmcbTUjbPTX7OOJfEbTZzlUWlZtGe3Gb5oV1/jnojhG//YZc3rs9zSEw==", + "version": "0.14.7", + "resolved": "https://registry.npmjs.org/shiki/-/shiki-0.14.7.tgz", + "integrity": "sha512-dNPAPrxSc87ua2sKJ3H5dQ/6ZaY8RNnaAqK+t0eG7p0Soi2ydiqbGOTaZCqaYvA/uZYfS1LJnemt3Q+mSfcPCg==", "dev": true, "dependencies": { "ansi-sequence-parser": "^1.1.0", @@ -85,30 +85,30 
@@ } }, "node_modules/typedoc": { - "version": "0.23.26", - "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.23.26.tgz", - "integrity": "sha512-5m4KwR5tOLnk0OtMaRn9IdbeRM32uPemN9kur7YK9wFqx8U0CYrvO9aVq6ysdZSV1c824BTm+BuQl2Ze/k1HtA==", + "version": "0.25.7", + "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.25.7.tgz", + "integrity": "sha512-m6A6JjQRg39p2ZVRIN3NKXgrN8vzlHhOS+r9ymUYtcUP/TIQPvWSq7YgE5ZjASfv5Vd5BW5xrir6Gm2XNNcOow==", "dev": true, "dependencies": { "lunr": "^2.3.9", - "marked": "^4.2.12", - "minimatch": "^7.1.3", - "shiki": "^0.14.1" + "marked": "^4.3.0", + "minimatch": "^9.0.3", + "shiki": "^0.14.7" }, "bin": { "typedoc": "bin/typedoc" }, "engines": { - "node": ">= 14.14" + "node": ">= 16" }, "peerDependencies": { - "typescript": "4.6.x || 4.7.x || 4.8.x || 4.9.x" + "typescript": "4.6.x || 4.7.x || 4.8.x || 4.9.x || 5.0.x || 5.1.x || 5.2.x || 5.3.x" } }, "node_modules/typescript": { - "version": "4.9.5", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.9.5.tgz", - "integrity": "sha512-1FXk9E2Hm+QzZQ7z+McJiHL4NW1F2EzMu9Nq9i3zAaGqibafqYwCVU6WyWAuyQRRzOlxou8xZSyXLEN8oKj24g==", + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.2.2.tgz", + "integrity": "sha512-mI4WrpHsbCIcwT9cF4FZvr80QUeKvsUsUvKDoR+X/7XHQH98xYD8YHZg7ANtz2GtZt/CBq2QJ0thkGJMHfqc1w==", "dev": true, "peer": true, "bin": { @@ -116,7 +116,7 @@ "tsserver": "bin/tsserver" }, "engines": { - "node": ">=4.2.0" + "node": ">=14.17" } }, "node_modules/vscode-oniguruma": { @@ -134,9 +134,9 @@ }, "dependencies": { "ansi-sequence-parser": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/ansi-sequence-parser/-/ansi-sequence-parser-1.1.0.tgz", - "integrity": "sha512-lEm8mt52to2fT8GhciPCGeCXACSz2UwIN4X2e2LJSnZ5uAbn2/dsYdOmUXq0AtWS5cpAupysIneExOgH0Vd2TQ==", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/ansi-sequence-parser/-/ansi-sequence-parser-1.1.1.tgz", + "integrity": "sha512-vJXt3yiaUL4UU546s3rPXlsry/RnM730G1+HkpKE012AN0sx1eOrxSu95oKDIonskeLTijMgqWZ3uDEe3NFvyg==", "dev": true }, "balanced-match": { @@ -155,9 +155,9 @@ } }, "jsonc-parser": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.2.0.tgz", - "integrity": "sha512-gfFQZrcTc8CnKXp6Y4/CBT3fTc0OVuDofpre4aEeEpSBPV5X5v4+Vmx+8snU7RLPrNHPKSgLxGo9YuQzz20o+w==", + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.2.1.tgz", + "integrity": "sha512-AilxAyFOAcK5wA1+LeaySVBrHsGQvUFCDWXKpZjzaL0PqW+xfBOttn8GNtWKFWqneyMZj41MWF9Kl6iPWLwgOA==", "dev": true }, "lunr": { @@ -167,24 +167,24 @@ "dev": true }, "marked": { - "version": "4.2.12", - "resolved": "https://registry.npmjs.org/marked/-/marked-4.2.12.tgz", - "integrity": "sha512-yr8hSKa3Fv4D3jdZmtMMPghgVt6TWbk86WQaWhDloQjRSQhMMYCAro7jP7VDJrjjdV8pxVxMssXS8B8Y5DZ5aw==", + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/marked/-/marked-4.3.0.tgz", + "integrity": "sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A==", "dev": true }, "minimatch": { - "version": "7.4.2", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-7.4.2.tgz", - "integrity": "sha512-xy4q7wou3vUoC9k1xGTXc+awNdGaGVHtFUaey8tiX4H1QRc04DZ/rmDFwNm2EBsuYEhAZ6SgMmYf3InGY6OauA==", + "version": "9.0.3", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.3.tgz", + "integrity": "sha512-RHiac9mvaRw0x3AYRgDC1CxAP7HTcNrrECeA8YYJeWnpo+2Q5CegtZjaotWTWxDG3UeGA1coE05iH1mPjT/2mg==", "dev": true, "requires": 
{ "brace-expansion": "^2.0.1" } }, "shiki": { - "version": "0.14.1", - "resolved": "https://registry.npmjs.org/shiki/-/shiki-0.14.1.tgz", - "integrity": "sha512-+Jz4nBkCBe0mEDqo1eKRcCdjRtrCjozmcbTUjbPTX7OOJfEbTZzlUWlZtGe3Gb5oV1/jnojhG//YZc3rs9zSEw==", + "version": "0.14.7", + "resolved": "https://registry.npmjs.org/shiki/-/shiki-0.14.7.tgz", + "integrity": "sha512-dNPAPrxSc87ua2sKJ3H5dQ/6ZaY8RNnaAqK+t0eG7p0Soi2ydiqbGOTaZCqaYvA/uZYfS1LJnemt3Q+mSfcPCg==", "dev": true, "requires": { "ansi-sequence-parser": "^1.1.0", @@ -194,21 +194,21 @@ } }, "typedoc": { - "version": "0.23.26", - "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.23.26.tgz", - "integrity": "sha512-5m4KwR5tOLnk0OtMaRn9IdbeRM32uPemN9kur7YK9wFqx8U0CYrvO9aVq6ysdZSV1c824BTm+BuQl2Ze/k1HtA==", + "version": "0.25.7", + "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.25.7.tgz", + "integrity": "sha512-m6A6JjQRg39p2ZVRIN3NKXgrN8vzlHhOS+r9ymUYtcUP/TIQPvWSq7YgE5ZjASfv5Vd5BW5xrir6Gm2XNNcOow==", "dev": true, "requires": { "lunr": "^2.3.9", - "marked": "^4.2.12", - "minimatch": "^7.1.3", - "shiki": "^0.14.1" + "marked": "^4.3.0", + "minimatch": "^9.0.3", + "shiki": "^0.14.7" } }, "typescript": { - "version": "4.9.5", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.9.5.tgz", - "integrity": "sha512-1FXk9E2Hm+QzZQ7z+McJiHL4NW1F2EzMu9Nq9i3zAaGqibafqYwCVU6WyWAuyQRRzOlxou8xZSyXLEN8oKj24g==", + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.2.2.tgz", + "integrity": "sha512-mI4WrpHsbCIcwT9cF4FZvr80QUeKvsUsUvKDoR+X/7XHQH98xYD8YHZg7ANtz2GtZt/CBq2QJ0thkGJMHfqc1w==", "dev": true, "peer": true }, diff --git a/js/common/package.json b/js/common/package.json index 64ab2736adbe3..cd2612aab4984 100644 --- a/js/common/package.json +++ b/js/common/package.json @@ -9,7 +9,7 @@ }, "author": "fs-eire", "scripts": { - "build:cjs": "tsc --module commonjs --outDir ./dist/cjs", + "build:cjs": "tsc --module commonjs --moduleResolution node10 --outDir ./dist/cjs", "build:esm": "tsc", "build:bundles": "webpack", "build": "node ./build.js", @@ -18,7 +18,7 @@ "test": "mocha ./test/**/*.js --timeout 30000" }, "devDependencies": { - "typedoc": "^0.23.22" + "typedoc": "^0.25.7" }, "main": "dist/cjs/index.js", "exports": { diff --git a/js/common/test/tsconfig.json b/js/common/test/tsconfig.json index 2e4927ac3b325..e9068ad837a81 100644 --- a/js/common/test/tsconfig.json +++ b/js/common/test/tsconfig.json @@ -2,7 +2,7 @@ "extends": "../../tsconfig.tools.json", "exclude": ["type-tests/**/*.ts"], "compilerOptions": { - "module": "ES2022", + "module": "Node16", "sourceMap": true } } From 45e20bf7810689ecf385957c34434c6d2456e32b Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Wed, 21 Feb 2024 12:38:37 +1000 Subject: [PATCH 023/279] Use build.py to build in py-win-gpu.yml so parallelization parameters are set (#19578) ### Description build.py sets a few parallelization parameters when building. Using msbuild directly lacks those. https://github.com/microsoft/onnxruntime/blob/7a5860e4909387448cb51351d3af50933238ba10/tools/ci_build/build.py#L1665-L1669 Changed to use build.py. If there's a concern with that we _could_ set the parameters in the yaml, but that will be uglier due to duplicating logic in multiple places. 
### Motivation and Context --- .../azure-pipelines/templates/py-win-gpu.yml | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml index 18368e59cad52..4315eae503ebd 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml @@ -120,17 +120,17 @@ jobs: $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} workingDirectory: '$(Build.BinariesDirectory)' - - task: VSBuild@1 + # building with build.py so the parallelization parameters are added to the msbuild command + - task: PythonScript@0 displayName: 'Build' inputs: - solution: '$(Build.BinariesDirectory)\RelWithDebInfo\onnxruntime.sln' - platform: x64 - configuration: RelWithDebInfo - msbuildArchitecture: $(buildArch) - maximumCpuCount: true - logProjectEvents: true - workingFolder: '$(Build.BinariesDirectory)\RelWithDebInfo' - createLogFile: true + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: > + --config RelWithDebInfo + --build_dir $(Build.BinariesDirectory) + --parallel --build + $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} + workingDirectory: '$(Build.BinariesDirectory)' # Esrp signing - template: win-esrp-dll.yml @@ -188,7 +188,7 @@ jobs: condition: and (succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) inputs: GdnPublishTsaOnboard: false - GdnPublishTsaConfigFile: '$(Build.sourcesDirectory)\.gdn\.gdntsa' + GdnPublishTsaConfigFile: '$(Build.sourcesDirectory)\.gdn\.gdntsa' - template: component-governance-component-detection-steps.yml parameters: From 0c4421cb7867434e1e08b4274f16f6c2f14cb4ce Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Wed, 21 Feb 2024 03:39:43 +0100 Subject: [PATCH 024/279] Fix compile warnings (as errors) for functions which miss returning required return value (#19079) ### Description Added dummy return values to functions that declare a return value but do not actually return one. ### Motivation and Context Fix compiler errors with 'warnings as errors' enabled. From 8fadc6c913bc30edff2e89756da515b9bd75d256 Mon Sep 17 00:00:00 2001 From: zhijiang <43435212+zhijxu-MS@users.noreply.github.com> Date: Wed, 21 Feb 2024 10:41:42 +0800 Subject: [PATCH 025/279] Zhijxu/cleanup cached tensors when oom (#19306) In PyTorch, when an OOM happens during the backward pass, the user can decrease the batch size and rerun without restarting the process. In ORT, however, the intermediate tensors are kept even after an OOM, so decreasing the batch size still fails. This is a PyTorch run; we can see that after the OOM failure, torch releases its tensors before the next step ![image](https://github.com/microsoft/onnxruntime/assets/43435212/92b8a2e3-454b-448a-a223-17cb91d463c2) This is from ORT; we can see that ORT does not release its tensors after the OOM failure. ![image](https://github.com/microsoft/onnxruntime/assets/43435212/bb6a3882-8e14-4f37-8079-e7f70fc2546b) With this PR, ORT releases the memory; **the 4GB of memory is not owned by ORT and will be released by torch at the end**.
![image](https://github.com/microsoft/onnxruntime/assets/43435212/7f39d711-4e36-47d5-aecf-3805433a6d01) --- onnxruntime/core/framework/execution_frame.cc | 21 +++++++++++++++ onnxruntime/core/framework/execution_frame.h | 2 ++ .../training/ortmodule/_training_manager.py | 26 ++++++++++--------- 3 files changed, 37 insertions(+), 12 deletions(-) diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc index 8c08152986cf6..32a5f749af084 100644 --- a/onnxruntime/core/framework/execution_frame.cc +++ b/onnxruntime/core/framework/execution_frame.cc @@ -204,6 +204,14 @@ AllocatorPtr IExecutionFrame::GetAllocator(const OrtDevice& info) const { Status IExecutionFrame::ReleaseMLValue(int ort_value_idx) { return ReleaseMLValueImpl(ort_value_idx); } +#ifdef ENABLE_TRAINING +void IExecutionFrame::ReleaseAllMLValues() { + for (size_t ort_value_idx = 0; ort_value_idx < all_values_.size(); ort_value_idx++) { + all_values_[ort_value_idx] = OrtValue(); + } +} +#endif + Status IExecutionFrame::ReleaseMLValueImpl(int ort_value_idx) { if (ort_value_idx == NodeIndexInfo::kInvalidEntry || static_cast(ort_value_idx) >= all_values_size_) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "invalid index ", ort_value_idx); @@ -831,7 +839,20 @@ AllocatorPtr ExecutionFrame::GetAllocatorImpl(const OrtDevice& info) const { // This method is not thread safe! // Return S_OK and nullptr if index map to a value that is an unused optional input/output Status ExecutionFrame::CreateNodeOutputMLValueImpl(OrtValue& ort_value, int ort_value_idx, const TensorShape* shape) { +#ifdef ENABLE_TRAINING + try { + auto status = AllocateAsPerAllocationPlan(ort_value, ort_value_idx, shape); + return status; + } catch (const std::exception& e) { + LOGS(session_state_.Logger(), WARNING) + << "Exception caught when allocating memory for ort_value with index: " << ort_value_idx + << "so clean up all OrtValues"; + ReleaseAllMLValues(); + return Status(ONNXRUNTIME, FAIL, e.what()); + } +#else return AllocateAsPerAllocationPlan(ort_value, ort_value_idx, shape); +#endif } void ExecutionFrame::VerifyOutputSizes(int output_index, const Node& node, const TensorShape& output_shape) { diff --git a/onnxruntime/core/framework/execution_frame.h b/onnxruntime/core/framework/execution_frame.h index 1576c16684faa..18d210ffd48f7 100644 --- a/onnxruntime/core/framework/execution_frame.h +++ b/onnxruntime/core/framework/execution_frame.h @@ -67,6 +67,8 @@ class IExecutionFrame { const std::unordered_map& initializers); Status GetOutputs(gsl::span fetch_mlvalue_idxs, std::vector& fetches); + // if OOM happens, then release all values, so session can run next batch. + void ReleaseAllMLValues(); #endif // TO DO: make it thread safe diff --git a/orttraining/orttraining/python/training/ortmodule/_training_manager.py b/orttraining/orttraining/python/training/ortmodule/_training_manager.py index cc533e549db92..73c32a2f51e41 100644 --- a/orttraining/orttraining/python/training/ortmodule/_training_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_training_manager.py @@ -196,18 +196,20 @@ def backward(ctx, *grad_outputs): # Run and get results backward_outputs = C.OrtValueVector() - self._execution_agent.run_backward(backward_inputs, backward_outputs, ctx.run_info.state) - # Destroy the state immediately (as opposed to be at the mercy of garbage collector) so it does not - # affect peak memory usage in a subsequent graph run. 
- del ctx.run_info.state - - # Fast version: all backward_outputs are converted first. - # This version only works if backward_outputs is an OrtValueVector. - transferred_backward_outputs = _utils._ortvalues_to_torch_tensor(backward_outputs, self._device) - - self._runtime_inspector.memory_ob.inspect_memory(Phase.POST_BACKWARD) - - return tuple(transferred_backward_outputs[idx] if idx != -1 else None for idx in self._gradient_map) + try: + self._execution_agent.run_backward(backward_inputs, backward_outputs, ctx.run_info.state) + # Destroy the state immediately (as opposed to be at the mercy of garbage collector) so it does not + # affect peak memory usage in a subsequent graph run. + + # Fast version: all backward_outputs are converted first. + # This version only works if backward_outputs is an OrtValueVector. + transferred_backward_outputs = _utils._ortvalues_to_torch_tensor(backward_outputs, self._device) + + self._runtime_inspector.memory_ob.inspect_memory(Phase.POST_BACKWARD) + res = tuple(transferred_backward_outputs[idx] if idx != -1 else None for idx in self._gradient_map) + return res + finally: + del ctx.run_info.state return _ORTModuleFunction From 6226c5f62f3d16b9702d5c40993ee9bf1cbd119c Mon Sep 17 00:00:00 2001 From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com> Date: Wed, 21 Feb 2024 11:08:48 +0800 Subject: [PATCH 026/279] [ROCm] Add SkipGroupNorm for ROCm EP (#19303) Add SkipGroupNorm for ROCm EP. --------- Co-authored-by: Peixuan Zuo --- cmake/onnxruntime_rocm_hipify.cmake | 5 - .../contrib_ops/rocm/diffusion/group_norm.cc | 152 ------------- .../rocm/diffusion/group_norm_ck.cuh | 35 +-- .../diffusion/group_norm_ck_impl/impl.cuh | 10 +- .../diffusion/group_norm_ck_impl/impl_fp16.cu | 8 +- .../diffusion/group_norm_ck_impl/impl_fp32.cu | 8 +- .../rocm/diffusion/group_norm_common.h | 125 +++------- .../rocm/diffusion/group_norm_impl.cu | 47 ++-- .../rocm/diffusion/group_norm_impl.h | 47 ---- .../rocm/diffusion/group_norm_impl_kernel.cuh | 213 ------------------ .../rocm/diffusion/group_norm_triton.cuh | 29 +-- .../rocm/diffusion/group_norm_triton.py | 16 +- .../rocm/diffusion/group_norm_tunable_op.h | 153 +++++++------ .../contrib_ops/rocm/rocm_contrib_kernels.cc | 2 + .../kernel_explorer/kernels/groupnorm_test.py | 136 ++++++++--- .../kernels/rocm/group_norm.cu | 112 +++++---- .../contrib_ops/skip_group_norm_op_test.cc | 14 +- tools/ci_build/amd_hipify.py | 2 + 18 files changed, 382 insertions(+), 732 deletions(-) delete mode 100644 onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc delete mode 100644 onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.h delete mode 100644 onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl_kernel.cuh diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake index d485abe6bb1a6..85a9bf50460d3 100644 --- a/cmake/onnxruntime_rocm_hipify.cmake +++ b/cmake/onnxruntime_rocm_hipify.cmake @@ -44,12 +44,7 @@ set(contrib_ops_excluded_files "bert/packed_multihead_attention.cc" "bert/packed_multihead_attention_impl.h" "bert/packed_multihead_attention_impl.cu" - "diffusion/group_norm.cc" "diffusion/group_norm_impl.cu" - "diffusion/group_norm_impl.h" - "diffusion/group_norm_impl_kernel.cuh" - "diffusion/group_norm_common_base.h" - "diffusion/group_norm_common_base.cc" "diffusion/nhwc_conv.cc" "math/gemm_float8.cc" "math/gemm_float8.cu" diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc b/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc deleted file mode 100644 index 
e82e15a304f4c..0000000000000 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/providers/rocm/rocm_common.h" -#include "contrib_ops/rocm/diffusion/group_norm.h" -#include "contrib_ops/rocm/diffusion/group_norm_impl.h" - -namespace onnxruntime { -namespace contrib { -namespace rocm { - -#define GROUP_NORM_TYPES float, MLFloat16 - -ONNX_OPERATOR_KERNEL_EX( - GroupNorm, kMSDomain, 1, kRocmExecutionProvider, - (*KernelDefBuilder::Create()).TypeConstraint("T", BuildKernelDefConstraints()), GroupNorm); - -using namespace ONNX_NAMESPACE; - -namespace { -template -struct DispatchGroupNorm { - Status operator()(RocmTuningContext* tuning_ctx, - Stream* stream, - Tensor* output, - const Tensor* input, - const Tensor* gamma, - const Tensor* beta, - void* workspace, - float epsilon, - int batch_size, - int num_channels, - int height, - int width, - int num_groups, - bool use_swish_activation) { - typedef typename ToHipType::MappedType HipT; - return LaunchGroupNormKernel( - tuning_ctx, - stream, - reinterpret_cast(output->MutableData()), - reinterpret_cast(input->Data()), - gamma->Data(), - beta->Data(), - workspace, - epsilon, - batch_size, - num_channels, - height, - width, - num_groups, - use_swish_activation); - } -}; - -} // namespace - -GroupNorm::GroupNorm(const OpKernelInfo& op_info) : RocmKernel(op_info) { - epsilon_ = op_info.GetAttrOrDefault("epsilon", 1e-5f); - ORT_ENFORCE(epsilon_ >= 0); - - int64_t num_groups; - ORT_ENFORCE(op_info.GetAttr("groups", &num_groups).IsOK()); - ORT_ENFORCE(num_groups >= 0); - num_groups_ = static_cast(num_groups); - - int64_t activation; - ORT_ENFORCE(op_info.GetAttr("activation", &activation).IsOK()); - ORT_ENFORCE(activation == 0 || activation == 1); // 0 is None, 1 is Swish - use_swish_activation_ = (activation == 1); - - channels_last_ = (op_info.GetAttrOrDefault("channels_last", static_cast(1)) != 0); -} - -Status GroupNorm::PrePack(const Tensor& /*tensor*/, int /*input_idx*/, AllocatorPtr /*alloc*/, - bool& is_packed, PrePackedWeights* /*prepacked_weights*/) { - is_packed = false; - return Status::OK(); -} - -Status GroupNorm::ComputeInternal(OpKernelContext* context) const { - const Tensor* input = context->Input(0); - const Tensor* gamma = context->Input(1); - const Tensor* beta = context->Input(2); - Tensor* output = context->Output(0, input->Shape()); - - if (!channels_last_) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "only the channels_last layout is supported"); - } - - const auto& input_dims = input->Shape().GetDims(); - if (input_dims.size() != 4) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "input is expected to have 4 dimensions, got ", input_dims.size()); - } - - const auto& gamma_dims = gamma->Shape().GetDims(); - if (gamma_dims.size() != 1) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "gamma is expected to have 1 dimension, got ", gamma_dims.size()); - } - if (gamma_dims[0] != input_dims[3]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Number of channels in gamma and input does not match"); - } - - const auto& beta_dims = beta->Shape().GetDims(); - if (beta_dims.size() != 1) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "beta is expected to have 1 dimension, got ", beta_dims.size()); - } - if (beta_dims[0] != input_dims[3]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Number of channels in beta 
and input does not match"); - } - - // Input and output format is NHWC - int batch_size = static_cast(input_dims[0]); - int num_channels = static_cast(input_dims[3]); - int height = static_cast(input_dims[1]); - int width = static_cast(input_dims[2]); - - if (num_channels % num_groups_ != 0) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "number of channels should be divisible by num_groups"); - } - - if (context->GetUseDeterministicCompute()) { - static std::once_flag log_warning; - std::call_once(log_warning, []() { - LOGS_DEFAULT(WARNING) << "GroupNorm has no deterministic GPU kernel, its outputs may still be nondeterministic."; - }); - } - - auto workspace = GetScratchBuffer(GetGroupNormWorkspaceSizeInBytes(), context->GetComputeStream()); - - utils::MLTypeCallDispatcher dispatcher(input->GetElementType()); - return dispatcher.InvokeRet(GetTuningContext(), context->GetComputeStream(), - output, input, gamma, beta, workspace.get(), - epsilon_, - batch_size, - num_channels, - height, - width, - num_groups_, - use_swish_activation_); -} - -} // namespace rocm -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck.cuh b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck.cuh index fb7091592c16e..d0a0d09fcbae3 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck.cuh +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck.cuh @@ -26,13 +26,18 @@ namespace rocm { using onnxruntime::rocm::CKDataTypeAdaptor; -using Swish = ck::tensor_operation::element_wise::Swish; +// The SiLU function is a special case of Swish function, +// The Swish function is parametrized by b, which is set to 1.0 for SiLU. They are defined as: +// SiLU(x) = x * sigmoid(x) +// Swish(x) = x * sigmoid(bx) +// The default value of b is 1.0 in ck::tensor_operation::element_wise::Swish function. We treat them as the same function here. +using Silu = ck::tensor_operation::element_wise::Swish; using Pass = ck::tensor_operation::element_wise::PassThrough; constexpr int Rank = 5; constexpr int NumReduceDim = 3; -template +template auto GetCKGroupNormNHWCTypeStringAndOps() { using XDataType = typename CKDataTypeAdaptor::type; using YDataType = typename CKDataTypeAdaptor::type; @@ -40,26 +45,30 @@ auto GetCKGroupNormNHWCTypeStringAndOps() { using GammaDataType = float; using BetaDataType = float; - using Activation = std::conditional_t; + using Activation = std::conditional_t; - std::vector>>> ret; + std::vector>>> ret; for (auto&& impl : internal::GetDeviceGroupNormInstances()) { - std::string swish_suffix = WithSwish ? "_Swish" : "_Pass"; - auto type_string = onnxruntime::MakeString(impl->GetTypeString()) + swish_suffix; + std::string silu_suffix = WithSilu ? 
"_Silu" : "_Pass"; + auto type_string = onnxruntime::MakeString(impl->GetTypeString()) + silu_suffix; auto invoker = impl->MakeInvokerPointer(); - auto ck_group_norm_op = [impl = std::move(impl), invoker = std::move(invoker)](const GroupNormNHWCParams* params) -> Status { - if constexpr (WithSwish) { + auto ck_group_norm_op = [impl = std::move(impl), invoker = std::move(invoker)]( + const GroupNormNHWCTunableParams* params) -> Status { + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF((params->skip != nullptr || params->bias != nullptr), + "Input skip or bias is not supported by composable kernel."); + if constexpr (WithSilu) { TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( - !params->withSwish, "Swish version only support groupnorm with swish"); + !params->use_silu, "Silu version only support groupnorm with silu"); } else { TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( - params->withSwish, "Pass version only support groupnorm without swish"); + params->use_silu, "Pass version only support groupnorm without silu"); } - std::vector in_lengths{params->n, params->h, params->w, params->groups, params->cPerGroup}; - std::vector in_out_strides{params->h * params->w * params->c, params->w * params->c, params->c, params->cPerGroup, 1}; - std::vector gamma_beta_strides{0, 0, 0, params->cPerGroup, 1}; + std::vector in_lengths{params->n, params->h, params->w, params->groups, params->channels_per_group}; + std::vector in_out_strides{params->h * params->w * params->c, params->w * params->c, + params->c, params->channels_per_group, 1}; + std::vector gamma_beta_strides{0, 0, 0, params->channels_per_group, 1}; std::vector reduce_dims{1, 2, 4}; auto activation = Activation{}; diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl.cuh b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl.cuh index 19b081881dcec..4cb371fdcf960 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl.cuh +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl.cuh @@ -18,7 +18,7 @@ namespace internal { using F16 = ck::half_t; using F32 = float; -using Swish = ck::tensor_operation::element_wise::Swish; +using Silu = ck::tensor_operation::element_wise::Swish; using Pass = ck::tensor_operation::element_wise::PassThrough; using ck::tensor_operation::device::DeviceNormalizationFwd; // the interface @@ -101,9 +101,9 @@ GetDeviceGroupNormInstances() { template <> std::vector>> + F16, F32, F32, F16, F32, Silu, 5, 3>>> GetDeviceGroupNormInstances< - F16, F32, F32, F16, F32, Swish, 5, 3>(); + F16, F32, F32, F16, F32, Silu, 5, 3>(); template <> std::vector std::vector>> + F32, F32, F32, F32, F32, Silu, 5, 3>>> GetDeviceGroupNormInstances< - F32, F32, F32, F32, F32, Swish, 5, 3>(); + F32, F32, F32, F32, F32, Silu, 5, 3>(); template <> std::vector -std::vector>> -GetDeviceGroupNormInstances() { - std::vector>> instances; +std::vector>> +GetDeviceGroupNormInstances() { + std::vector>> instances; ck::tensor_operation::device::instance::add_device_operation_instances( instances, - device_normalization_f16_instances{}); + device_normalization_f16_instances{}); return instances; } diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl_fp32.cu b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl_fp32.cu index 9b0ccab17b4c1..ceb53ed442abc 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl_fp32.cu +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl_fp32.cu @@ -11,12 +11,12 @@ namespace rocm { namespace internal { template 
<> -std::vector>> -GetDeviceGroupNormInstances() { - std::vector>> instances; +std::vector>> +GetDeviceGroupNormInstances() { + std::vector>> instances; ck::tensor_operation::device::instance::add_device_operation_instances( instances, - device_normalization_f32_instances{}); + device_normalization_f32_instances{}); return instances; } diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_common.h b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_common.h index 008ae20b0561f..7cff640db2f34 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_common.h +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_common.h @@ -8,110 +8,47 @@ #include "core/providers/rocm/cu_inc/common.cuh" #include "core/providers/rocm/rocm_common.h" #include "core/providers/rocm/tunable/rocm_tunable.h" +#include "contrib_ops/rocm/diffusion/group_norm_common_base.h" namespace onnxruntime { namespace contrib { namespace rocm { -using onnxruntime::rocm::CeilDiv; - -int32_t findMaxDivisor(int32_t n, int32_t maxAllowedDivisor) { - int32_t maxDivisor = -1; - for (int32_t i = 1; i <= std::sqrt(n); i++) { - if (n % i == 0) { - int32_t divisor1 = n / i; - int32_t divisor2 = i; - - if (divisor1 > maxDivisor && divisor1 < maxAllowedDivisor) { - maxDivisor = divisor1; - } - if (divisor2 > maxDivisor && divisor2 < maxAllowedDivisor) { - maxDivisor = divisor2; - } - } - } - return maxDivisor; -} - template -struct GroupNormNHWCParams : OpParams { - GroupNormNHWCParams(RocmTuningContext* tuning_ctx, onnxruntime::Stream* stream, T* dst, float* redBuffer, const T* src, const float* gamma, - const float* beta, int32_t n, int32_t h, int32_t w, int32_t c, int32_t groups, float epsilon, bool withSwish) - : OpParams(tuning_ctx, stream), dst(dst), src(src), gamma(gamma), beta(beta), redBuffer(redBuffer), epsilon(epsilon), n(n), h(h), w(w), c(c), groups(groups), withSwish(withSwish) { - int32_t maxBlocksPerHW = 1024; - switch (c) { - case 960: - case 1920: - cPerBlock = 480; - break; - case 512: - case 256: - cPerBlock = 256; - break; - case 128: - cPerBlock = 128; - break; - default: - cPerBlock = 320; - } - - hw = h * w; - const int32_t blocksPerHW = findMaxDivisor(hw, maxBlocksPerHW); - hwPerBlock = CeilDiv(hw, blocksPerHW); - cPerGroup = c / groups; - hwc = hw * c; - invHWC = 1.F / (float)(hw * cPerGroup); - groupsPerBlock = cPerBlock / cPerGroup; - } +struct GroupNormNHWCTunableParams : OpParams, GroupNormNHWCParams { + GroupNormNHWCTunableParams(RocmTuningContext* tuning_ctx, + onnxruntime::Stream* ort_stream, + T* output, + T* add_out, + const T* input, + const T* skip, + const T* bias, + const float* gamma, + const float* beta, + float* workspace, + float epsilon, + int batch_size, + int num_channels, + int height, + int width, + int num_groups, + bool use_silu, + bool broadcast_skip, + int channels_per_block) + : OpParams(tuning_ctx, ort_stream), + GroupNormNHWCParams(output, add_out, input, skip, bias, gamma, beta, workspace, epsilon, batch_size, + num_channels, height, width, num_groups, use_silu, broadcast_skip, channels_per_block) {} std::string Signature() const override { - std::string swish_suffix = withSwish ? "_Swish" : "_Pass"; - std::string sig = std::to_string(n) + "_" + std::to_string(h * w) + "_" + std::to_string(c) + "_" + std::to_string(groups) + swish_suffix; + std::string silu_suffix = this->use_silu ? "_silu" : "_pass"; + std::string skip_suffix = this->skip != nullptr ? "_skip" : "_noskip"; + std::string broadcast_suffix = this->broadcast_skip ? 
"_broadcast" : "_nobroadcast"; + std::string bias_suffix = this->bias != nullptr ? "_bias" : "_nobias"; + std::string sig = std::to_string(this->n) + "_" + std::to_string(this->h * this->w) + "_" + + std::to_string(this->c) + "_" + std::to_string(this->groups) + silu_suffix + + skip_suffix + broadcast_suffix + bias_suffix; return sig; } - - // The output buffer. Layout NHWC. - T* dst; - // The input buffer. Layout NHWC. - T const* src; - // The gamma scaling factor. - float const* gamma; - // The beta term to add in GN. - float const* beta; - // The temporary buffer to do the global parallel reduction. Size: - // BLOCKS_PER_BATCH x C x 2. - float* redBuffer; - float epsilon; - - // The number of instances in the batch. - int32_t n; - // The height and width of each activation map. - int32_t h; - int32_t w; - // The number of channels. - int32_t c; - // The number of groups. - int32_t groups; - // Do we apply the Swish activation function? - bool withSwish; - - // Precomputed values and parameters to control the execution of the kernels. - - // The number of activations per instance (h * w) and the number of - // activations per block. - int32_t hw; - int32_t hwPerBlock; - // The number of channels per group and blocks per activation in the C - // dimension. - int32_t cPerBlock; - int32_t cPerGroup; - - // The precomputed stride between instances. - int32_t hwc; - // The inverse of hwc in floats (to compute mean/var). - float invHWC; - // The precomputed number of groups per block. - int32_t groupsPerBlock; }; } // namespace rocm diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.cu b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.cu index dbd5009e63676..142aaf14e8d2d 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.cu +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.cu @@ -15,9 +15,12 @@ namespace rocm { template Status LaunchGroupNormKernel( RocmTuningContext* tuning_ctx, - Stream* stream, + Stream* ort_stream, T* output, + T* add_out, const T* input, + const T* skip, + const T* bias, const float* gamma, const float* beta, void* workspace, @@ -27,19 +30,26 @@ Status LaunchGroupNormKernel( int height, int width, int num_groups, - bool use_swish_activation) { - if (batch_size > static_cast(kMaxGroupNormBatchSize)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, StatusCode::NOT_IMPLEMENTED, - "only support batch_size <= 32. Got", batch_size); - } + bool use_silu, + bool broadcast_skip, + int channels_per_block) { + GroupNormNHWCTunableParams params(tuning_ctx, ort_stream, output, add_out, input, skip, bias, gamma, beta, + reinterpret_cast(workspace), epsilon, batch_size, num_channels, + height, width, num_groups, use_silu, broadcast_skip, channels_per_block); - if (num_groups != static_cast(kGroupNormNumberOfGroups)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, StatusCode::NOT_IMPLEMENTED, - "only num_groups=32 is supported. 
Got", num_groups); + if (params.channels_per_block % params.channels_per_group != 0 || + params.channels_per_block > kMaxSize || + (params.channels_per_group % CHANNELS_PER_THREAD != 0)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, + "GroupNorm in ROCM does not support the input: n=", batch_size, + " h=", height, + " w=", width, + " c=", num_channels, + " groups=", num_groups); } - GroupNormNHWCParams params(tuning_ctx, stream, output, reinterpret_cast(workspace), input, gamma, beta, - batch_size, height, width, num_channels, num_groups, epsilon, use_swish_activation); + HIP_RETURN_IF_ERROR(hipMemsetAsync( + params.group_sum_buffer, 0, GetGroupNormWorkspaceSizeInBytes(batch_size, num_groups), params.StreamHandle())); if (tuning_ctx->IsTunableOpEnabled()) { static GroupNormNHWCTunableOp op; @@ -50,14 +60,17 @@ Status LaunchGroupNormKernel( } template Status LaunchGroupNormKernel(RocmTuningContext* tuning_ctx, Stream* stream, half* output, - const half* input, const float* gamma, const float* beta, void* workspace, - float epsilon, int batch_size, int num_channels, - int height, int width, int num_groups, bool swish); + half* add_out, const half* input, const half* skip, const half* bias, + const float* gamma, const float* beta, void* workspace, float epsilon, + int batch_size, int num_channels, int height, int width, int num_groups, + bool use_silu, bool broadcast_skip, int channels_per_block); template Status LaunchGroupNormKernel(RocmTuningContext* tuning_ctx, Stream* stream, float* output, - const float* input, const float* gamma, const float* beta, void* workspace, - float epsilon, int batch_size, int num_channels, - int height, int width, int num_groups, bool swish); + float* add_out, const float* input, const float* skip, const float* bias, + const float* gamma, const float* beta, void* workspace, float epsilon, + int batch_size, int num_channels, int height, int width, int num_groups, + bool use_silu, bool broadcast_skip, int channels_per_block); + } // namespace rocm } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.h b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.h deleted file mode 100644 index a0f7e0aca5def..0000000000000 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#pragma once - -#include -#include - -#include "core/common/common.h" -#include "core/common/status.h" -#include "core/providers/rocm/tunable/rocm_tunable.h" - -using onnxruntime::rocm::tunable::RocmTuningContext; - -namespace onnxruntime { -namespace contrib { -namespace rocm { - -constexpr size_t kMaxGroupNormBatchSize = 32; -constexpr size_t kGroupNormNumberOfGroups = 32; - -constexpr size_t GetGroupNormWorkspaceSizeInBytes() { - // Two buffers for sum and squared sum - return (sizeof(float) * 2) * kMaxGroupNormBatchSize * kGroupNormNumberOfGroups; -} - -template -Status LaunchGroupNormKernel( - RocmTuningContext* tuning_ctx, - Stream* stream, - T* output, // normalized output tensor - const T* input, // input tensor - const float* gamma, // gamma (also known as weight or scale) - const float* beta, // beta (also known as bias) - void* workspace, // Work space - float epsilon, // epsilon used normalization - int batch_size, // N - int num_channels, // C - int height, // H - int width, // W - int num_groups, // number of groups - bool use_swish_activation // Whether there is Swish activation after group normalization -); - -} // namespace rocm -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl_kernel.cuh b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl_kernel.cuh deleted file mode 100644 index d6322a12a9363..0000000000000 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl_kernel.cuh +++ /dev/null @@ -1,213 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -// The ROCm kernel is modified from TensorRT 8.5. -/* - * SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include "core/providers/rocm/cu_inc/common.cuh" -#include "core/providers/rocm/rocm_common.h" - -namespace onnxruntime { -namespace contrib { -namespace rocm { - -static inline __device__ __host__ float sigmoid(float x) { - return 1.F / (1.F + expf(-x)); -} - -struct GroupSums { - // Is it the 1st element of the group? - int32_t flag; - // The sum. - float sum; - // The sum of squares. - float sumSq; -}; - -struct GroupSumsOp { - inline __device__ GroupSums operator()(GroupSums const& a, GroupSums const& b) { - GroupSums dst; - dst.sum = b.flag ? b.sum : (a.sum + b.sum); - dst.sumSq = b.flag ? 
b.sumSq : (a.sumSq + b.sumSq); - dst.flag = a.flag + b.flag; - return dst; - } -}; - -template -inline __device__ void UpdateSum(const T* src, int64_t offset, U& sum, U& sumSq) { - using VecT = onnxruntime::rocm::aligned_vector; - const VecT input_v = *reinterpret_cast(src + offset); - -#pragma unroll - for (int i = 0; i < ILP; i++) { - const U val = static_cast(input_v.val[i]); - sum += val; - sumSq += val * val; - } -} - -template -__global__ void groupNormNHWCSumKernel(const T* src, float* redBuffer, int32_t cPerBlock, int32_t hwPerBlock, int32_t hw, - int32_t hwc, int32_t c, int32_t cPerGroup, int32_t groups, int32_t groupsPerBlock) { - // The object in charge of doing the sums for the different blocks. - typedef hipcub::BlockScan BlockScan; - - // Allocate shared memory for BlockScan. - __shared__ typename BlockScan::TempStorage tempStorage; - // Allocate shared memory for the groups. We could reduce the amount of shared - // memory reserved. - __shared__ float2 smem[ThreadsPerBlock]; - - // The instance in the batch. - int32_t ni = blockIdx.z; - // The channel loaded by that thread (ILP channels per thread). - int32_t ci = blockIdx.x * cPerBlock + threadIdx.x * ILP; - - // The first activation loaded by that block. - int32_t hwBegin = blockIdx.y * hwPerBlock; - // The last activation loaded by that block. - int32_t hwEnd = min(hwBegin + hwPerBlock, hw); - - // The sums. - float sum = 0.F; - float sumSq = 0.F; - - // Iterate over the activations to compute the sums. - if (ci < c) { - for (int32_t hwi = hwBegin; hwi < hwEnd; ++hwi) { - // The offset. - int64_t offset = static_cast(ni) * hwc + static_cast(hwi) * c + ci; - UpdateSum(src, offset, sum, sumSq); - } - } - - // The group that thread works on and the channel in the group (modulus). - int32_t gi = threadIdx.x * ILP / cPerGroup; - int32_t cj = threadIdx.x * ILP - cPerGroup * gi; - - // The data for the summations. - GroupSums inp{cj == 0 ? 1 : 0, sum, sumSq}; - - // Do the segmented scan. - GroupSums out; - BlockScan(tempStorage).InclusiveScan(inp, out, GroupSumsOp()); - - // Store the results for the groups in shared memory (to produce coalesced - // stores later). - if (cj == cPerGroup - ILP) { // ILP channels per thread - smem[gi] = make_float2(out.sum, out.sumSq); - } - - // Make sure the data is in shared memory. - __syncthreads(); - - // The global group index. - int32_t gj = blockIdx.x * groupsPerBlock + threadIdx.x; - - // Threads that have nothing left to do, exit. - if (threadIdx.x >= groupsPerBlock || gj >= groups) { - return; - } - - // The first threads (those storing to global memory, load the values). - float2 sums = smem[threadIdx.x]; - - // Store to global memory. 
- atomicAdd(&redBuffer[(2 * ni + 0) * groups + gj], sums.x); - atomicAdd(&redBuffer[(2 * ni + 1) * groups + gj], sums.y); -} - -template -__device__ void computeGroupNorm(const T* src, T* dst, int64_t offset, U mean, U invStdDev, - const U* gamma_v, const U* beta_v, bool swish) { - using VecT = onnxruntime::rocm::aligned_vector; - const VecT input_v = *reinterpret_cast(src + offset); - VecT output_v; - -#pragma unroll - for (int i = 0; i < ILP; i++) { - U val = static_cast(input_v.val[i]); - val = (val - mean) * invStdDev; - val = gamma_v[i] * val + beta_v[i]; - - if (swish) { - val = val * sigmoid(val); - } - output_v.val[i] = static_cast(val); - } - *(reinterpret_cast(dst + offset)) = output_v; -} - -template -__global__ void groupNormNHWCScaleKernel(T* dst, const T* src, const float* gamma, const float* beta, const float* redBuffer, float epsilon, int32_t c, int32_t cPerBlock, - int32_t cPerGroup, int32_t groups, int32_t hwc, float invHWC, int32_t hw, int32_t hwPerBlock, bool withSwish) { - // The channel loaded by that thread (ILP channels per thread for F16x2). - int32_t ci = blockIdx.x * cPerBlock + threadIdx.x * ILP; - if (ci >= c) { - return; - } - - // The instance in the batch. - int32_t ni = blockIdx.z; - - // The group that thread works on and the channel in the group (modulus). - int32_t gi = ci / cPerGroup; - - // Load the sum and sum of squares for the group. - float sum = 0.F, sumSq = 0.F; - if (gi < groups) { - sum = redBuffer[(2 * ni + 0) * groups + gi]; - sumSq = redBuffer[(2 * ni + 1) * groups + gi]; - } - - using VecF = onnxruntime::rocm::aligned_vector; - - const VecF gamma_v = *reinterpret_cast(gamma + ci); - const VecF beta_v = *reinterpret_cast(beta + ci); - - // Compute the mean. - float mean = sum * invHWC; - // Compute the variance. - float var = sumSq * invHWC - (mean * mean); - // Compute the inverse of the stddev. - float invStdDev = var <= 0.F ? 1.F : rsqrtf(var + epsilon); - - // The first activation loaded by that block. - int32_t hwBegin = blockIdx.y * hwPerBlock; - // The last activation loaded by that block. - int32_t hwEnd = min(hwBegin + hwPerBlock, hw); - - // Iterate over the activations to compute the sums. - for (int32_t hwi = hwBegin; hwi < hwEnd; ++hwi) { - // The src/dst offset. - int64_t offset = (int64_t)ni * hwc + hwi * c + ci; - - // Fetch ILP channels per thread. - computeGroupNorm(src, dst, offset, mean, invStdDev, gamma_v.val, beta_v.val, withSwish); - } -} - -} // namespace rocm -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh index b7b9441ac997d..b3d3e92209b39 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh @@ -20,21 +20,21 @@ namespace rocm { namespace { -template +template std::string GetGroupNormTritonGroupName() { std::string ret = "GroupNormTriton_"; - std::string swish_suffix = WithSwish ? "Swish_" : "Pass_"; - ret += swish_suffix; + std::string silu_suffix = WithSilu ? 
"Silu_" : "Pass_"; + ret += silu_suffix; ret += GetDataTypeName(); return ret; } } // namespace -template +template auto GetTritonGroupNormNHWCTypeStringAndOps() { - std::vector>>> ret; - auto group_name = GetGroupNormTritonGroupName(); + std::vector>>> ret; + auto group_name = GetGroupNormTritonGroupName(); auto* kernel_list = GetOrtTritonKernelByGroup(group_name); if (kernel_list == nullptr) { return ret; @@ -45,16 +45,19 @@ auto GetTritonGroupNormNHWCTypeStringAndOps() { auto* metadata = GetOrtTritonKernelMetadata(i); auto block_size = metadata->constants.at("BLOCK_SIZE"); auto hw_size = metadata->constants.at("HW_SIZE"); - auto impl = [i, block_size, hw_size](const GroupNormNHWCParams* params) -> Status { + auto impl = [i, block_size, hw_size](const GroupNormNHWCTunableParams* params) -> Status { + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF((params->skip != nullptr || params->bias != nullptr), + "Input skip or bias is not supported by triton kernel."); TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( - params->cPerGroup > block_size || params->cPerGroup * 2 <= block_size, - "Arg block_size (", block_size, ") is not the next power of 2 of cPerGroup (", params->cPerGroup, ")."); + params->channels_per_group > block_size || params->channels_per_group * 2 <= block_size, + "Arg block_size (", block_size, ") is not the next power of 2 of channels_per_group (", + params->channels_per_group, ")."); TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( params->hw % hw_size != 0, "Arg hw_size (", hw_size, ") is not a divisor of hw (", params->hw, ")."); - if constexpr (WithSwish) { - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!params->withSwish, "Swish version does not support GN w/o swish."); + if constexpr (WithSilu) { + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!params->use_silu, "Silu version does not support GN w/o silu."); } else { - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(params->withSwish, "Pass version does not support GN w/ swish."); + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(params->use_silu, "Pass version does not support GN w/ silu."); } // Construct args for launch kernel struct { @@ -73,7 +76,7 @@ auto GetTritonGroupNormNHWCTypeStringAndOps() { (const void*)params->beta, params->hw, params->c, - params->cPerGroup, + params->channels_per_group, params->epsilon}; // Grid dim is (batch_count, groups, 1) diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py index 56b3a030b289e..5368cb1cf635b 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py @@ -21,7 +21,7 @@ def group_norm_kernel( eps, BLOCK_SIZE: tl.constexpr, HW_SIZE: tl.constexpr, - ACTIVATION_SWISH: tl.constexpr, + ACTIVATION_SILU: tl.constexpr, ): row_x = tl.program_id(0) row_y = tl.program_id(1) @@ -62,7 +62,7 @@ def group_norm_kernel( x = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32) x_hat = (x - group_mean) * rstd y = x_hat * gamma + beta - if ACTIVATION_SWISH: + if ACTIVATION_SILU: y *= tl.sigmoid(y) tl.store(y_ptr + offsets, y, mask=mask) @@ -71,7 +71,7 @@ def group_norm_kernel( # blocks = [16, 32, 64, 128, 256, 512] # hw_sizes = [8, 16, 32, 64, 128, 256, 512] # but this will result in too many functions and slow down the compilation. 
-with_swish = [True, False] +with_silu = [True, False] dtypes = ["fp32", "fp16"] blocks = [16, 32, 64, 128] hw_sizes = [8, 16, 32, 64, 128, 256] @@ -84,14 +84,14 @@ def group_norm_kernel( def get_function_table(): func_table = [] - for swish, dtype, hw_size, warp, b in product(with_swish, dtypes, hw_sizes, warps, blocks): - swish_suffix = "Swish" if swish else "Pass" - name = name_pattern.format(swish_suffix, dtype, b, hw_size, warp) - group = group_pattern.format(swish_suffix, dtype) + for silu, dtype, hw_size, warp, b in product(with_silu, dtypes, hw_sizes, warps, blocks): + silu_suffix = "Silu" if silu else "Pass" + name = name_pattern.format(silu_suffix, dtype, b, hw_size, warp) + group = group_pattern.format(silu_suffix, dtype) sig = sig_pattern.format(dtype, dtype) kwargs = { "num_warps": warp, - "constants": {"BLOCK_SIZE": b, "HW_SIZE": hw_size, "ACTIVATION_SWISH": int(swish)}, + "constants": {"BLOCK_SIZE": b, "HW_SIZE": hw_size, "ACTIVATION_SILU": int(silu)}, } func_desc = {"name": name, "group": group, "func": group_norm_kernel, "sig": sig, "kwargs": kwargs} func_table.append(func_desc) diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_tunable_op.h b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_tunable_op.h index 25d820f7ed326..e6831f764b418 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_tunable_op.h +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_tunable_op.h @@ -20,115 +20,117 @@ namespace rocm { using onnxruntime::rocm::GPU_WARP_SIZE; template -void groupNormNHWCSum(const GroupNormNHWCParams* params) { - // Make sure the values are as we expect. - ORT_ENFORCE(params->c % params->cPerBlock == 0 && params->hw % params->hwPerBlock == 0); - // Make sure a group does not span multiple blocks. - ORT_ENFORCE(params->cPerBlock % params->cPerGroup == 0); - +void GroupNormNHWCSum(const GroupNormNHWCTunableParams* params) { dim3 grid; // The number of blocks to compute all the channels. - grid.x = params->c / params->cPerBlock; + grid.x = DivUp(params->c, params->channels_per_block); // The number of blocks to compute all the activations in a given instance. - grid.y = CeilDiv(params->hw, params->hwPerBlock); + grid.y = DivUp(params->hw, params->hw_per_block); // The number of instances. grid.z = params->n; -#define LAUNCH_GROUPNORM_SUM(ThreadsPerBlock, VecSize) \ - groupNormNHWCSumKernel \ - <<StreamHandle()>>>( \ - params->src, params->redBuffer, params->cPerBlock, \ - params->hwPerBlock, params->hw, params->hwc, params->c, \ - params->cPerGroup, params->groups, params->groupsPerBlock); \ +#define LAUNCH_GROUPNORM_SUM(ThreadsPerBlock, VecSize) \ + GroupNormNHWCSumKernel \ + <<StreamHandle()>>>( \ + params->skip_workspace, params->group_sum_buffer, params->src, params->skip, params->bias, \ + params->channels_per_block, params->hw_per_block, params->hw, params->hwc, params->c, \ + params->channels_per_group, params->groups, params->groups_per_block, params->broadcast_skip); \ break; - switch (params->cPerBlock) { - case 320: - LAUNCH_GROUPNORM_SUM(256, 2) - case 480: - LAUNCH_GROUPNORM_SUM(256, 2) + // Threads_per_block is half of values in kSizes since CHANNELS_PER_THREAD = 2. 
+ switch (params->threads_per_block) { case 256: - LAUNCH_GROUPNORM_SUM(128, 2) + LAUNCH_GROUPNORM_SUM(256, CHANNELS_PER_THREAD) + case 192: + LAUNCH_GROUPNORM_SUM(192, CHANNELS_PER_THREAD) + case 160: + LAUNCH_GROUPNORM_SUM(160, CHANNELS_PER_THREAD) case 128: - LAUNCH_GROUPNORM_SUM(64, 2) + LAUNCH_GROUPNORM_SUM(128, CHANNELS_PER_THREAD) + case 64: + LAUNCH_GROUPNORM_SUM(64, CHANNELS_PER_THREAD) default: ORT_NOT_IMPLEMENTED("Not implemented"); } } template -Status GroupNormNHWCSumOp(const GroupNormNHWCParams* params) { +Status GroupNormNHWCSumOp(const GroupNormNHWCTunableParams* params) { dim3 grid; - grid.x = params->c / params->cPerBlock; - grid.y = CeilDiv(params->hw, params->hwPerBlock); + grid.x = DivUp(params->c, params->channels_per_block); + grid.y = DivUp(params->hw, params->hw_per_block); grid.z = params->n; - groupNormNHWCSumKernel + GroupNormNHWCSumKernel <<StreamHandle()>>>( - params->src, params->redBuffer, params->cPerBlock, params->hwPerBlock, - params->hw, params->hwc, params->c, params->cPerGroup, params->groups, params->groupsPerBlock); + params->skip_workspace, params->group_sum_buffer, params->src, params->skip, params->bias, + params->channels_per_block, params->hw_per_block, params->hw, params->hwc, params->c, + params->channels_per_group, params->groups, params->groups_per_block, params->broadcast_skip); return HIP_CALL(hipGetLastError()); } template -void groupNormNHWCScale(const GroupNormNHWCParams* params) { - // Make sure the dimensions are aligned with what we expect. - ORT_ENFORCE(params->c % params->cPerBlock == 0); - // Make sure a group does not span multiple blocks. - ORT_ENFORCE(params->cPerBlock % params->cPerGroup == 0); - +void GroupNormNHWCScale(const GroupNormNHWCTunableParams* params) { dim3 grid; // The number of blocks to compute all the channels. - grid.x = params->c / params->cPerBlock; + grid.x = DivUp(params->c, params->channels_per_block); // The number of blocks to compute all the activations in a given instance. - grid.y = CeilDiv(params->hw, params->hwPerBlock); + grid.y = DivUp(params->hw, params->hw_per_block); // The number of instances. grid.z = params->n; -#define LAUNCH_GROUPNORM_SCALE(ThreadsPerBlock, VecSize) \ - groupNormNHWCScaleKernel \ - <<StreamHandle()>>>( \ - params->dst, params->src, params->gamma, params->beta, \ - params->redBuffer, params->epsilon, params->c, params->cPerBlock, \ - params->cPerGroup, params->groups, params->hwc, params->invHWC, \ - params->hw, params->hwPerBlock, params->withSwish); \ +#define LAUNCH_GROUPNORM_SCALE(ThreadsPerBlock, VecSize) \ + GroupNormNHWCScaleKernel \ + <<StreamHandle()>>>( \ + params->dst, params->src, params->skip, params->gamma, params->beta, params->skip_workspace, \ + params->group_sum_buffer, params->epsilon, params->c, params->channels_per_block, \ + params->channels_per_group, params->groups, params->hwc, params->inv_hw_channels_per_group, \ + params->hw, params->hw_per_block, params->use_silu); \ break; - switch (params->cPerBlock) { - case 320: - LAUNCH_GROUPNORM_SCALE(256, 2) - case 480: - LAUNCH_GROUPNORM_SCALE(256, 2) + // Threads_per_block is half of values in kSizes since CHANNELS_PER_THREAD = 2. 
+ switch (params->threads_per_block) { case 256: - LAUNCH_GROUPNORM_SCALE(128, 2) + LAUNCH_GROUPNORM_SCALE(256, CHANNELS_PER_THREAD) + case 192: + LAUNCH_GROUPNORM_SCALE(192, CHANNELS_PER_THREAD) + case 160: + LAUNCH_GROUPNORM_SCALE(160, CHANNELS_PER_THREAD) case 128: - LAUNCH_GROUPNORM_SCALE(64, 2) + LAUNCH_GROUPNORM_SCALE(128, CHANNELS_PER_THREAD) + case 64: + LAUNCH_GROUPNORM_SCALE(64, CHANNELS_PER_THREAD) default: ORT_NOT_IMPLEMENTED("Not implemented"); } } template -Status GroupNormNHWCScaleOp(const GroupNormNHWCParams* params) { +Status GroupNormNHWCScaleOp(const GroupNormNHWCTunableParams* params) { dim3 grid; - grid.x = params->c / params->cPerBlock; - grid.y = CeilDiv(params->hw, params->hwPerBlock); + grid.x = DivUp(params->c, params->channels_per_block); + grid.y = DivUp(params->hw, params->hw_per_block); grid.z = params->n; - groupNormNHWCScaleKernel + GroupNormNHWCScaleKernel <<StreamHandle()>>>( - params->dst, params->src, params->gamma, params->beta, params->redBuffer, params->epsilon, params->c, params->cPerBlock, - params->cPerGroup, params->groups, params->hwc, params->invHWC, params->hw, params->hwPerBlock, params->withSwish); + params->dst, params->src, params->skip, params->gamma, params->beta, params->skip_workspace, + params->group_sum_buffer, params->epsilon, params->c, params->channels_per_block, params->channels_per_group, + params->groups, params->hwc, params->inv_hw_channels_per_group, params->hw, params->hw_per_block, + params->use_silu); return HIP_CALL(hipGetLastError()); } template class GroupNormNHWCOp { public: - Status operator()(const GroupNormNHWCParams* params) { - HIP_RETURN_IF_ERROR(hipMemsetAsync(params->redBuffer, 0, GetGroupNormWorkspaceSizeInBytes(), params->StreamHandle())); + Status operator()(const GroupNormNHWCTunableParams* params) { + HIP_RETURN_IF_ERROR(hipMemsetAsync(params->group_sum_buffer, + 0, + GetGroupNormWorkspaceSizeInBytes(params->n, params->groups), + params->StreamHandle())); auto status = GroupNormNHWCSumOp(params); ORT_RETURN_IF_ERROR(status); HIP_RETURN_IF_ERROR(hipGetLastError()); @@ -138,29 +140,30 @@ class GroupNormNHWCOp { return Status::OK(); } - Status IsSupported(const GroupNormNHWCParams* params) { + Status IsSupported(const GroupNormNHWCTunableParams* params) { TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( - !(params->c % VecSize == 0 && params->cPerGroup % VecSize == 0), - "The number of channels (", params->c, ") or the number of channels per group (", params->cPerGroup, + !(params->c % VecSize == 0 && params->channels_per_group % VecSize == 0), + "The number of channels (", params->c, ") or the number of channels per group (", params->channels_per_group, ") isn't divisible by the number of vector size: ", VecSize); - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!(params->cPerBlock % params->cPerGroup == 0 && - params->c % params->cPerBlock == 0 && params->hw % params->hwPerBlock == 0), - "The value of attributes don't meet the requirements."); - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!(params->cPerBlock <= ThreadsPerBlock * VecSize && - params->cPerBlock > (ThreadsPerBlock - GPU_WARP_SIZE) * VecSize), + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!(params->channels_per_block <= ThreadsPerBlock * VecSize && + params->channels_per_block > (ThreadsPerBlock - GPU_WARP_SIZE) * VecSize), "Configuration: Threads (", ThreadsPerBlock, "), vector size (", - VecSize, ") is redundant for the number of channels per group: ", params->cPerBlock); + VecSize, ") is redundant for the number of channels per group: ", + 
params->channels_per_block); return Status::OK(); } }; template -Status GroupNormNHWCStaticSelection(const GroupNormNHWCParams* params) { - HIP_RETURN_IF_ERROR(hipMemsetAsync(params->redBuffer, 0, GetGroupNormWorkspaceSizeInBytes(), params->StreamHandle())); - groupNormNHWCSum(params); +Status GroupNormNHWCStaticSelection(const GroupNormNHWCTunableParams* params) { + HIP_RETURN_IF_ERROR(hipMemsetAsync(params->group_sum_buffer, + 0, + GetGroupNormWorkspaceSizeInBytes(params->n, params->groups), + params->StreamHandle())); + GroupNormNHWCSum(params); HIP_RETURN_IF_ERROR(hipGetLastError()); - groupNormNHWCScale(params); + GroupNormNHWCScale(params); HIP_RETURN_IF_ERROR(hipGetLastError()); return Status::OK(); } @@ -178,30 +181,30 @@ Status GroupNormNHWCStaticSelection(const GroupNormNHWCParams* params) { ADD_OP_FOR_ALL_VEC_SIZE(name, 320) template -class GroupNormNHWCTunableOp : public TunableOp> { +class GroupNormNHWCTunableOp : public TunableOp> { public: GroupNormNHWCTunableOp() { this->RegisterOp(GroupNormNHWCStaticSelection); ADD_OP_FOR_ALL_THREADS_PER_BLOCK_ALL_VEC_SIZE(GroupNormNHWCOp) #ifdef USE_COMPOSABLE_KERNEL - for (auto&& [_, op] : GetCKGroupNormNHWCTypeStringAndOps()) { + for (auto&& [_, op] : GetCKGroupNormNHWCTypeStringAndOps()) { ORT_UNUSED_PARAMETER(_); this->RegisterOp(std::move(op)); } - for (auto&& [_, op] : GetCKGroupNormNHWCTypeStringAndOps()) { + for (auto&& [_, op] : GetCKGroupNormNHWCTypeStringAndOps()) { ORT_UNUSED_PARAMETER(_); this->RegisterOp(std::move(op)); } #endif // USE_COMPOSABLE_KERNEL #ifdef USE_TRITON_KERNEL - for (auto&& [_, op] : GetTritonGroupNormNHWCTypeStringAndOps()) { + for (auto&& [_, op] : GetTritonGroupNormNHWCTypeStringAndOps()) { ORT_UNUSED_PARAMETER(_); this->RegisterOp(std::move(op)); } - for (auto&& [_, op] : GetTritonGroupNormNHWCTypeStringAndOps()) { + for (auto&& [_, op] : GetTritonGroupNormNHWCTypeStringAndOps()) { ORT_UNUSED_PARAMETER(_); this->RegisterOp(std::move(op)); } diff --git a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc index 55cd6a1d112f5..382a3951f3a83 100644 --- a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc @@ -93,6 +93,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, Samp class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ScaledTanh); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ScaledTanh); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ScaledTanh); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, SkipGroupNorm); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, SkipLayerNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, SkipLayerNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, SkipSimplifiedLayerNormalization); @@ -246,6 +247,7 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py index e32cb032798fc..8334d20e47c86 100644 --- 
a/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py +++ b/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py @@ -35,7 +35,11 @@ def sigmoid_function(x): return 1.0 / (1.0 + np.exp(-x)) -def group_norm(input_x, gamma, beta, num_groups, epsilon, with_swish): +def group_norm(input_x, skip_x, bias_x, gamma, beta, num_groups, epsilon, with_silu, has_skip): + add_output = None + if has_skip: + input_x = input_x + skip_x + bias_x + add_output = input_x n, h, w, c = input_x.shape input_x = input_x.transpose([0, 3, 1, 2]) assert c % num_groups == 0 @@ -45,46 +49,70 @@ def group_norm(input_x, gamma, beta, num_groups, epsilon, with_swish): x = x.transpose([0, 2, 3, 1]) x = x * gamma + beta - if with_swish: + if with_silu: x = x * sigmoid_function(x) - return x + return x, add_output -def run_group_norm(batch_size: int, height: int, num_channels: int, num_groups: int, dtype: str, swish: bool, func): +def run_group_norm( + batch_size: int, height: int, num_channels: int, num_groups: int, dtype: str, silu: bool, has_skip: bool, func +): np.random.seed(0) width = height input_x = np.random.rand(batch_size, height, width, num_channels).astype(np.float32) gamma = np.random.rand(num_channels).astype(np.float32) beta = np.random.rand(num_channels).astype(np.float32) # the size of workspace is defined in onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl.h L18 - workspace = np.random.rand((np.dtype(np.float32).itemsize * 2) * 32 * 32).astype(np.float32) + workspace = np.random.rand((np.dtype(np.float32).itemsize * 2) * batch_size * num_groups).astype(np.float32) epsilon = 1e-05 output_y = np.random.rand(batch_size, height, width, num_channels).astype(dtype) - use_swish = swish - host_x = input_x.astype(dtype) - input_d = ke.DeviceArray(host_x) + skip_x = ( + np.random.rand(batch_size, height, width, num_channels).astype(np.float32) + if has_skip + else np.empty((0), dtype=dtype) + ) + bias_x = np.random.rand(num_channels).astype(np.float32) if has_skip else np.empty((0), dtype=dtype) + add_output = ( + np.random.rand(batch_size, height, width, num_channels).astype(dtype) + if has_skip + else np.empty((0), dtype=dtype) + ) + use_silu = silu + broadcast_skip = False + channels_per_block = 0 # Compute in params initialization + + input_d = ke.DeviceArray(input_x.astype(dtype)) + skip_d = ke.DeviceArray(skip_x.astype(dtype)) + bias_d = ke.DeviceArray(bias_x.astype(dtype)) gamma_d = ke.DeviceArray(gamma) beta_d = ke.DeviceArray(beta) workspace_d = ke.DeviceArray(workspace) y_d = ke.DeviceArray(output_y) + y_add_d = ke.DeviceArray(add_output) f = getattr(ke, func) my_op = f( y_d, - workspace_d, + y_add_d, input_d, + skip_d, + bias_d, gamma_d, beta_d, + workspace_d, + epsilon, batch_size, + num_channels, height, width, - num_channels, num_groups, - epsilon, - use_swish, + use_silu, + broadcast_skip, + channels_per_block, ) - y_ref = group_norm(input_x, gamma, beta, num_groups, epsilon, use_swish).astype(dtype) + y_ref, y_add_d_ref = group_norm(input_x, skip_x, bias_x, gamma, beta, num_groups, epsilon, use_silu, has_skip) + y_ref = y_ref.astype(dtype) for impl in my_op.ListOps(): if not my_op.SelectOp(impl): @@ -95,6 +123,10 @@ def run_group_norm(batch_size: int, height: int, num_channels: int, num_groups: y_d.UpdateHostNumpyArray() np.testing.assert_allclose(y_ref, output_y, atol=1e-02) + if has_skip: + y_add_d_ref = y_add_d_ref.astype(dtype) + y_add_d.UpdateHostNumpyArray() + np.testing.assert_allclose(y_add_d_ref, add_output, atol=1e-02) dtypes = ["float32", "float16"] @@ 
-102,19 +134,21 @@ def run_group_norm(batch_size: int, height: int, num_channels: int, num_groups: @pytest.mark.parametrize("sd_sizes", get_sd_sizes()) @pytest.mark.parametrize("dtype", dtypes) -@pytest.mark.parametrize("swish", [True]) -def test_group_norm(sd_sizes, dtype, swish): +@pytest.mark.parametrize("silu", [True]) +@pytest.mark.parametrize("has_skip", [True, False]) +def test_group_norm(sd_sizes, dtype, silu, has_skip): for func in dtype_to_funcs(dtype): - run_group_norm(*sd_sizes, dtype, swish, func) + run_group_norm(*sd_sizes, dtype, silu, has_skip, func) @pytest.mark.parametrize("sd_sizes", get_sd_sizes()) @pytest.mark.parametrize("dtype", dtypes) -@pytest.mark.parametrize("swish", [True]) -def test_group_norm_ck(sd_sizes, dtype, swish): - swish_suffix = "Swish" if swish else "Pass" - ck_f_name = "CKGroupNormNHWC" + swish_suffix + "_" + dtype_to_suffix(dtype) - run_group_norm(*sd_sizes, dtype, swish, ck_f_name) +@pytest.mark.parametrize("silu", [True]) +@pytest.mark.parametrize("has_skip", [False]) +def test_group_norm_ck(sd_sizes, dtype, silu, has_skip): + silu_suffix = "Silu" if silu else "Pass" + ck_f_name = "CKGroupNormNHWC" + silu_suffix + "_" + dtype_to_suffix(dtype) + run_group_norm(*sd_sizes, dtype, silu, has_skip, ck_f_name) @dataclass @@ -136,37 +170,67 @@ def report(self): def profile_group_norm_func( - batch_size: int, height: int, width: int, num_channels: int, num_groups: int, dtype: str, swish: bool, func + batch_size: int, + height: int, + width: int, + num_channels: int, + num_groups: int, + dtype: str, + silu: bool, + has_skip: bool, + func, ): np.random.seed(0) input_x = np.random.rand(batch_size, height, width, num_channels).astype(dtype) gamma = np.random.rand(num_channels).astype(np.float32) beta = np.random.rand(num_channels).astype(np.float32) - workspace = np.random.rand(np.dtype(np.float32).itemsize * 2 * 32 * 32).astype(np.float32) + workspace = np.random.rand(np.dtype(np.float32).itemsize * 2 * batch_size * num_groups).astype(np.float32) epsilon = 0.05 output_y = np.random.rand(batch_size, height, width, num_channels).astype(dtype) - use_swish = swish + + skip_x = ( + np.random.rand(batch_size, height, width, num_channels).astype(dtype) + if has_skip + else np.empty((0), dtype=dtype) + ) + bias_x = np.random.rand(num_channels).astype(dtype) if has_skip else np.empty((0), dtype=dtype) + add_output = ( + np.random.rand(batch_size, height, width, num_channels).astype(dtype) + if has_skip + else np.empty((0), dtype=dtype) + ) + use_silu = silu + broadcast_skip = False + channels_per_block = 0 # Compute in params initialization input_d = ke.DeviceArray(input_x) + skip_d = ke.DeviceArray(skip_x) + bias_d = ke.DeviceArray(bias_x) gamma_d = ke.DeviceArray(gamma) beta_d = ke.DeviceArray(beta) workspace_d = ke.DeviceArray(workspace) y_d = ke.DeviceArray(output_y) + y_add_d = ke.DeviceArray(add_output) f = getattr(ke, func) my_op = f( y_d, - workspace_d, + y_add_d, input_d, + skip_d, + bias_d, gamma_d, beta_d, + workspace_d, + epsilon, batch_size, + num_channels, height, width, - num_channels, num_groups, - epsilon, - use_swish, + use_silu, + broadcast_skip, + channels_per_block, ) for impl in my_op.ListOps(): duration_ms = -1 @@ -181,14 +245,14 @@ def profile_group_norm_func( ) -def profile_with_args(batch_size, height, width, num_channels, num_groups, dtype, swish=True, sort=True): +def profile_with_args(batch_size, height, width, num_channels, num_groups, dtype, silu=True, has_skip=True, sort=True): with ke.benchmark(sort): for func in dtype_to_funcs(dtype): 
- profile_group_norm_func(batch_size, height, width, num_channels, num_groups, dtype, swish, func) + profile_group_norm_func(batch_size, height, width, num_channels, num_groups, dtype, silu, has_skip, func) # ck function - swish_suffix = "Swish" if swish else "Pass" - ck_f_name = "CKGroupNormNHWC" + swish_suffix + "_" + dtype_to_suffix(dtype) - profile_group_norm_func(batch_size, height, width, num_channels, num_groups, dtype, swish, ck_f_name) + silu_suffix = "Silu" if silu else "Pass" + ck_f_name = "CKGroupNormNHWC" + silu_suffix + "_" + dtype_to_suffix(dtype) + profile_group_norm_func(batch_size, height, width, num_channels, num_groups, dtype, silu, has_skip, ck_f_name) sd_profile_sizes = [ @@ -227,7 +291,8 @@ def profile(): group.add_argument("num_channels", type=int) group.add_argument("num_groups", type=int) group.add_argument("dtype", choices=dtypes) - group.add_argument("--swish", action="store_true") + group.add_argument("--silu", action="store_true") + group.add_argument("--has_skip", action="store_true") group.add_argument("--sort", action="store_true") if len(sys.argv) == 1: @@ -241,6 +306,7 @@ def profile(): args.num_channels, args.num_groups, args.dtype, - args.swish, + args.silu, + args.has_skip, args.sort, ) diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/rocm/group_norm.cu b/onnxruntime/python/tools/kernel_explorer/kernels/rocm/group_norm.cu index 0bd47b2c0387e..6af163ab94b10 100644 --- a/onnxruntime/python/tools/kernel_explorer/kernels/rocm/group_norm.cu +++ b/onnxruntime/python/tools/kernel_explorer/kernels/rocm/group_norm.cu @@ -12,17 +12,21 @@ #include "python/tools/kernel_explorer/kernel_explorer_interface.h" namespace py = pybind11; - +using onnxruntime::contrib::rocm::GetGroupNormWorkspaceSizeInBytes; namespace onnxruntime { template class GroupNormNHWC : public IKernelExplorer { public: - GroupNormNHWC(DeviceArray& output, DeviceArray& workspace, DeviceArray& input, DeviceArray& gamma, DeviceArray& beta, - int batch_size, int height, int width, int num_channels, int num_groups, float epsilon, bool use_swish) - : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(workspace.ptr()), - static_cast(input.ptr()), static_cast(gamma.ptr()), static_cast(beta.ptr()), - batch_size, height, width, num_channels, num_groups, epsilon, use_swish) { + GroupNormNHWC(DeviceArray& output, DeviceArray& add_output, DeviceArray& input, DeviceArray& skip, DeviceArray& bias, + DeviceArray& gamma, DeviceArray& beta, DeviceArray& workspace, float epsilon, + int batch_size, int num_channels, int height, int width, int num_groups, bool use_silu, + bool broadcast_skip, int channels_per_block) + : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(add_output.ptr()), + static_cast(input.ptr()), static_cast(skip.ptr()), static_cast(bias.ptr()), + static_cast(gamma.ptr()), static_cast(beta.ptr()), static_cast(workspace.ptr()), + epsilon, batch_size, num_channels, height, width, num_groups, use_silu, broadcast_skip, + channels_per_block) { type_string_ = "GroupNormNHWC_" + std::to_string(ThreadsPerBlock) + "_" + std::to_string(VecSize); } @@ -40,7 +44,7 @@ class GroupNormNHWC : public IKernelExplorer { } private: - using ParamsT = contrib::rocm::GroupNormNHWCParams; + using ParamsT = contrib::rocm::GroupNormNHWCTunableParams; ParamsT params_{}; contrib::rocm::GroupNormNHWCOp op_{}; std::string type_string_{}; @@ -49,11 +53,15 @@ class GroupNormNHWC : public IKernelExplorer { template class GroupNormNHWCStaticSelection : public 
IKernelExplorer { public: - GroupNormNHWCStaticSelection(DeviceArray& output, DeviceArray& workspace, DeviceArray& input, DeviceArray& gamma, DeviceArray& beta, - int batch_size, int height, int width, int num_channels, int num_groups, float epsilon, bool use_swish) - : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(workspace.ptr()), - static_cast(input.ptr()), static_cast(gamma.ptr()), static_cast(beta.ptr()), - batch_size, height, width, num_channels, num_groups, epsilon, use_swish) { + GroupNormNHWCStaticSelection(DeviceArray& output, DeviceArray& add_output, DeviceArray& input, DeviceArray& skip, + DeviceArray& bias, DeviceArray& gamma, DeviceArray& beta, DeviceArray& workspace, + float epsilon, int batch_size, int num_channels, int height, int width, int num_groups, + bool use_silu, bool broadcast_skip, int channels_per_block) + : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(add_output.ptr()), + static_cast(input.ptr()), static_cast(skip.ptr()), static_cast(bias.ptr()), + static_cast(gamma.ptr()), static_cast(beta.ptr()), static_cast(workspace.ptr()), + epsilon, batch_size, num_channels, height, width, num_groups, use_silu, broadcast_skip, + channels_per_block) { type_string_ = "GroupNormNHWCStaticSelection"; } @@ -71,7 +79,7 @@ class GroupNormNHWCStaticSelection : public IKernelExplorer { } private: - using ParamsT = contrib::rocm::GroupNormNHWCParams; + using ParamsT = contrib::rocm::GroupNormNHWCTunableParams; ParamsT params_{}; std::string type_string_{}; }; @@ -79,11 +87,15 @@ class GroupNormNHWCStaticSelection : public IKernelExplorer { template class GroupNormNHWCTunable : public IKernelExplorer { public: - GroupNormNHWCTunable(DeviceArray& output, DeviceArray& workspace, DeviceArray& input, DeviceArray& gamma, DeviceArray& beta, - int batch_size, int height, int width, int num_channels, int num_groups, float epsilon, bool use_swish) - : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(workspace.ptr()), - static_cast(input.ptr()), static_cast(gamma.ptr()), static_cast(beta.ptr()), - batch_size, height, width, num_channels, num_groups, epsilon, use_swish) { + GroupNormNHWCTunable(DeviceArray& output, DeviceArray& add_output, DeviceArray& input, DeviceArray& skip, + DeviceArray& bias, DeviceArray& gamma, DeviceArray& beta, DeviceArray& workspace, + float epsilon, int batch_size, int num_channels, int height, int width, int num_groups, + bool use_silu, bool broadcast_skip, int channels_per_block) + : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(add_output.ptr()), + static_cast(input.ptr()), static_cast(skip.ptr()), static_cast(bias.ptr()), + static_cast(gamma.ptr()), static_cast(beta.ptr()), static_cast(workspace.ptr()), + epsilon, batch_size, num_channels, height, width, num_groups, use_silu, broadcast_skip, + channels_per_block) { params_.TuningContext()->EnableTunableOpAndTuning(); } @@ -100,21 +112,25 @@ class GroupNormNHWCTunable : public IKernelExplorer { } private: - using ParamsT = contrib::rocm::GroupNormNHWCParams; + using ParamsT = contrib::rocm::GroupNormNHWCTunableParams; ParamsT params_{}; contrib::rocm::GroupNormNHWCTunableOp op_{}; }; #ifdef USE_COMPOSABLE_KERNEL -template +template class CKGroupNormNHWC : public IKernelExplorer { public: - CKGroupNormNHWC(DeviceArray& output, DeviceArray& workspace, DeviceArray& input, DeviceArray& gamma, DeviceArray& beta, - int batch_size, int height, int width, int num_channels, int num_groups, float epsilon, bool 
use_swish) - : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(workspace.ptr()), - static_cast(input.ptr()), static_cast(gamma.ptr()), static_cast(beta.ptr()), - batch_size, height, width, num_channels, num_groups, epsilon, use_swish) { - for (auto&& [type_string, op] : contrib::rocm::GetCKGroupNormNHWCTypeStringAndOps()) { + CKGroupNormNHWC(DeviceArray& output, DeviceArray& add_output, DeviceArray& input, DeviceArray& skip, + DeviceArray& bias, DeviceArray& gamma, DeviceArray& beta, DeviceArray& workspace, + float epsilon, int batch_size, int num_channels, int height, int width, int num_groups, + bool use_silu, bool broadcast_skip, int channels_per_block) + : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(add_output.ptr()), + static_cast(input.ptr()), static_cast(skip.ptr()), static_cast(bias.ptr()), + static_cast(gamma.ptr()), static_cast(beta.ptr()), static_cast(workspace.ptr()), + epsilon, batch_size, num_channels, height, width, num_groups, use_silu, broadcast_skip, + channels_per_block) { + for (auto&& [type_string, op] : contrib::rocm::GetCKGroupNormNHWCTypeStringAndOps()) { type_strings_.emplace_back(std::move(type_string)); ops_.emplace_back(std::move(op)); } @@ -141,7 +157,7 @@ class CKGroupNormNHWC : public IKernelExplorer { } private: - using ParamsT = contrib::rocm::GroupNormNHWCParams; + using ParamsT = contrib::rocm::GroupNormNHWCTunableParams; using OpT = rocm::tunable::Op; ParamsT params_{}; std::vector ops_; @@ -151,15 +167,19 @@ class CKGroupNormNHWC : public IKernelExplorer { #endif // USE_COMPOSABLE_KERNEL #ifdef USE_TRITON_KERNEL -template +template class GroupNormNHWCTriton : public IKernelExplorer { public: - GroupNormNHWCTriton(DeviceArray& output, DeviceArray& workspace, DeviceArray& input, DeviceArray& gamma, DeviceArray& beta, - int batch_size, int height, int width, int num_channels, int num_groups, float epsilon, bool use_swish) - : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(workspace.ptr()), - static_cast(input.ptr()), static_cast(gamma.ptr()), static_cast(beta.ptr()), - batch_size, height, width, num_channels, num_groups, epsilon, use_swish) { - for (auto&& [name, op] : contrib::rocm::GetTritonGroupNormNHWCTypeStringAndOps()) { + GroupNormNHWCTriton(DeviceArray& output, DeviceArray& add_output, DeviceArray& input, DeviceArray& skip, + DeviceArray& bias, DeviceArray& gamma, DeviceArray& beta, DeviceArray& workspace, + float epsilon, int batch_size, int num_channels, int height, int width, int num_groups, + bool use_silu, bool broadcast_skip, int channels_per_block) + : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(add_output.ptr()), + static_cast(input.ptr()), static_cast(skip.ptr()), static_cast(bias.ptr()), + static_cast(gamma.ptr()), static_cast(beta.ptr()), static_cast(workspace.ptr()), + epsilon, batch_size, num_channels, height, width, num_groups, use_silu, broadcast_skip, + channels_per_block) { + for (auto&& [name, op] : contrib::rocm::GetTritonGroupNormNHWCTypeStringAndOps()) { name_strings_.emplace_back(name); ops_.emplace_back(std::move(op)); } @@ -186,7 +206,7 @@ class GroupNormNHWCTriton : public IKernelExplorer { } private: - using ParamsT = contrib::rocm::GroupNormNHWCParams; + using ParamsT = contrib::rocm::GroupNormNHWCTunableParams; using OpT = rocm::tunable::Op; ParamsT params_{}; std::vector ops_; @@ -198,7 +218,8 @@ class GroupNormNHWCTriton : public IKernelExplorer { #define REGISTER_OP(name, type, threads_per_block, vec_size) 
\ py::class_>(m, #name "_" #type "_" #threads_per_block "_" #vec_size) \ .def(py::init()) \ + DeviceArray&, DeviceArray&, DeviceArray&, float, \ + int, int, int, int, int, bool, bool, int>()) \ .def("SetRepeats", &name::SetRepeats) \ .def("Profile", &name::Profile) \ .def("Run", &name::Run) \ @@ -220,7 +241,8 @@ class GroupNormNHWCTriton : public IKernelExplorer { #define REGISTER_COMMON(name, type, ...) \ py::class_>(m, name) \ .def(py::init()) \ + DeviceArray&, DeviceArray&, DeviceArray&, float, \ + int, int, int, int, int, bool, bool, int>()) \ .def("SetRepeats", &type<__VA_ARGS__>::SetRepeats) \ .def("Profile", &type<__VA_ARGS__>::Profile) \ .def("Run", &type<__VA_ARGS__>::Run) \ @@ -230,11 +252,11 @@ class GroupNormNHWCTriton : public IKernelExplorer { #define REGISTER_OP_TYPED(name, type) \ REGISTER_COMMON(#name "_" #type, name, type) -#define REGISTER_CK(type, with_swish, swish_suffix) \ - REGISTER_COMMON("CKGroupNormNHWC" swish_suffix "_" #type, CKGroupNormNHWC, type, with_swish) +#define REGISTER_CK(type, with_silu, silu_suffix) \ + REGISTER_COMMON("CKGroupNormNHWC" silu_suffix "_" #type, CKGroupNormNHWC, type, with_silu) -#define REGISTER_TRITON(type, with_swish, swish_suffix) \ - REGISTER_COMMON("GroupNormNHWCTriton" swish_suffix "_" #type, GroupNormNHWCTriton, type, with_swish) +#define REGISTER_TRITON(type, with_silu, silu_suffix) \ + REGISTER_COMMON("GroupNormNHWCTriton" silu_suffix "_" #type, GroupNormNHWCTriton, type, with_silu) KE_REGISTER(m) { REGISTER_OP_FOR_ALL_THREADS_PER_BLOCK_ALL_VEC_SIZE(GroupNormNHWC, half); @@ -248,16 +270,16 @@ KE_REGISTER(m) { #ifdef USE_COMPOSABLE_KERNEL REGISTER_CK(half, false, "Pass"); - REGISTER_CK(half, true, "Swish"); + REGISTER_CK(half, true, "Silu"); REGISTER_CK(float, false, "Pass"); - REGISTER_CK(float, true, "Swish"); + REGISTER_CK(float, true, "Silu"); #endif // USE_COMPOSABLE_KERNEL #ifdef USE_TRITON_KERNEL REGISTER_TRITON(half, false, "Pass"); - REGISTER_TRITON(half, true, "Swish"); + REGISTER_TRITON(half, true, "Silu"); REGISTER_TRITON(float, false, "Pass"); - REGISTER_TRITON(float, true, "Swish"); + REGISTER_TRITON(float, true, "Silu"); #endif } diff --git a/onnxruntime/test/contrib_ops/skip_group_norm_op_test.cc b/onnxruntime/test/contrib_ops/skip_group_norm_op_test.cc index fefd5722054de..ea8537f243f5d 100644 --- a/onnxruntime/test/contrib_ops/skip_group_norm_op_test.cc +++ b/onnxruntime/test/contrib_ops/skip_group_norm_op_test.cc @@ -114,16 +114,21 @@ TEST(SkipGroupNormTest, SkipGroupNorm_with_bias) { int min_cuda_architecture = 530; bool enable_cuda = HasCudaEnvironment(min_cuda_architecture); + bool enable_rocm = (nullptr != DefaultRocmExecutionProvider().get()); std::array channels_last_values = {-1, 1}; for (const int channels_last : channels_last_values) { - if (enable_cuda) { + if (enable_cuda || enable_rocm) { std::vector> execution_providers; if (enable_cuda && channels_last != 0) { execution_providers.push_back(DefaultCudaExecutionProvider()); } + if (enable_rocm && channels_last != 0) { + execution_providers.push_back(DefaultRocmExecutionProvider()); + } + // Don't run the test if no providers are supported if (execution_providers.empty()) { continue; @@ -230,6 +235,7 @@ TEST(SkipGroupNormTest, SkipGroupNorm_no_bias_broadcast_skip) { int min_cuda_architecture = 530; bool enable_cuda = HasCudaEnvironment(min_cuda_architecture); + bool enable_rocm = (nullptr != DefaultRocmExecutionProvider().get()); std::array has_add_out_values = {true, false}; std::array skip_dims = {2, 4}; @@ -237,12 +243,16 @@ 
TEST(SkipGroupNormTest, SkipGroupNorm_no_bias_broadcast_skip) {
   constexpr int channels_last = 1;
   for (const int skip_dim : skip_dims) {
     for (const bool has_add_out : has_add_out_values) {
-      if (enable_cuda) {
+      if (enable_cuda || enable_rocm) {
         std::vector> execution_providers;
         if (enable_cuda && channels_last != 0) {
           execution_providers.push_back(DefaultCudaExecutionProvider());
         }
+        if (enable_rocm && channels_last != 0) {
+          execution_providers.push_back(DefaultRocmExecutionProvider());
+        }
+
         // Don't run the test if no providers are supported
         if (execution_providers.empty()) {
           continue;
diff --git a/tools/ci_build/amd_hipify.py b/tools/ci_build/amd_hipify.py
index e286236ba6447..f1d3702e3245e 100644
--- a/tools/ci_build/amd_hipify.py
+++ b/tools/ci_build/amd_hipify.py
@@ -181,6 +181,8 @@ def hipify(hipify_perl_path, src_file_path, dst_file_path):
     s = s.replace("rocm_device_prop_", "cuda_device_prop_")
     s = s.replace("rocm_device_arch_", "cuda_device_arch_")
+    s = s.replace("HipTuningContext", "RocmTuningContext")
+
     # We want hipfft, which needs hipDataType etc, but only do this for files that have "fft" in their names
     # And we do this last, undoing or fixing hipify mistakes.
     if "fft" in src_file_path:

From 124bde985ae883566c44f5cd84d351612006100c Mon Sep 17 00:00:00 2001
From: Baiju Meswani
Date: Tue, 20 Feb 2024 19:20:42 -0800
Subject: [PATCH 027/279] Bring QAT POC back to a functional state (#19290)

---
 .../test/python/qat_poc_example/README.md |  2 +-
 .../test/python/qat_poc_example/model.py  | 56 +++++++------------
 .../test/python/qat_poc_example/qat.py    |  2 +-
 .../test/python/qat_poc_example/train.py  | 18 ++----
 4 files changed, 27 insertions(+), 51 deletions(-)

diff --git a/orttraining/orttraining/test/python/qat_poc_example/README.md b/orttraining/orttraining/test/python/qat_poc_example/README.md
index 6840e98bd9c86..05072b410b730 100644
--- a/orttraining/orttraining/test/python/qat_poc_example/README.md
+++ b/orttraining/orttraining/test/python/qat_poc_example/README.md
@@ -48,7 +48,7 @@ We use `onnxruntime.training.onnxblock` to perform the above operations to get t
 > **_NOTE:_** As of this writing, ORT does not have its own `"Observers"`. Instead, we rely on the `onnxruntime.quantization` tool to quantize the model and give us an initial estimate of the quantization parameters using its calibration process. Here the calibration process is used as a substitute for the observers to present the POC.
-> **_NOTE:_** Typically, the weights in the statically quantized onnx model is associated with a DQ node only (not the QDQ pair) since weights are quantized. However, QAT requires weights and biases to be non quantized. We ensure that the weights have dedicated QDQ pair by passing in the flag AddQDQPairToWeight=True`
+> **_NOTE:_** Typically, the weights in the statically quantized onnx model is associated with a DQ node only (not the QDQ pair) since weights are quantized. However, QAT requires weights and biases to be non quantized. We ensure that the weights have dedicated QDQ pair by passing in the flag `AddQDQPairToWeight=True`
 > **_NOTE:_** Typically, the bias term in the statically quantized onnx model is associated with a DQ node only (not the QDQ pair) since it is quantized as int32 as opposed to int8. So, we disable quantizing the bias term using the flag QuantizeBias=False`
diff --git a/orttraining/orttraining/test/python/qat_poc_example/model.py b/orttraining/orttraining/test/python/qat_poc_example/model.py
index 91d7ccd7294f5..601362a59e379 100644
--- a/orttraining/orttraining/test/python/qat_poc_example/model.py
+++ b/orttraining/orttraining/test/python/qat_poc_example/model.py
@@ -5,7 +5,7 @@
 import onnx
 import torch
-import onnxruntime.training.onnxblock as onnxblock
+from onnxruntime.training import artifacts
 class MNIST(torch.nn.Module):
@@ -96,42 +96,26 @@ def create_training_artifacts(model_path, artifacts_dir, model_prefix):
     4. The checkpoint file
     """
-    class MNISTWithLoss(onnxblock.TrainingModel):
-        def __init__(self):
-            super().__init__()
-            self.loss = onnxblock.loss.CrossEntropyLoss()
-
-        def build(self, output_name):
-            return self.loss(output_name)
-
-    mnist_with_loss = MNISTWithLoss()
-    onnx_model, eval_model, optimizer_model = onnx.load(model_path), None, None
-
-    # Build the training and eval graphs
-    logging.info("Using onnxblock to create the training artifacts.")
-    with onnxblock.onnx_model(onnx_model) as model_accessor:
-        _ = mnist_with_loss(onnx_model.graph.output[0].name)
-    eval_model = model_accessor.eval_model
-
-    # Build the optimizer graph
-    optimizer = onnxblock.optim.AdamW()
-    with onnxblock.onnx_model() as accessor:
-        _ = optimizer(mnist_with_loss.parameters())
-    optimizer_model = accessor.model
+    onnx_model = onnx.load(model_path)
+
+    requires_grad = [
+        param.name
+        for param in onnx_model.graph.initializer
+        if (not param.name.endswith("_scale") and not param.name.endswith("_zero_point"))
+    ]
+    artifacts.generate_artifacts(
+        onnx_model,
+        requires_grad=requires_grad,
+        loss=artifacts.LossType.CrossEntropyLoss,
+        optimizer=artifacts.OptimType.AdamW,
+        artifact_directory=artifacts_dir,
+        prefix=model_prefix,
+    )
     # Create the training artifacts
-    train_model_path = os.path.join(artifacts_dir, f"{model_prefix}_train.onnx")
-    logging.info(f"Saving the training model to {train_model_path}.")
-    onnx.save(onnx_model, train_model_path)
-    eval_model_path = os.path.join(artifacts_dir, f"{model_prefix}_eval.onnx")
-    logging.info(f"Saving the eval model to {eval_model_path}.")
-    onnx.save(eval_model, eval_model_path)
-    optimizer_model_path = os.path.join(artifacts_dir, f"{model_prefix}_optimizer.onnx")
-    logging.info(f"Saving the optimizer model to {optimizer_model_path}.")
-    onnx.save(optimizer_model, optimizer_model_path)
-    trainable_params, non_trainable_params = mnist_with_loss.parameters()
-    checkpoint_path = os.path.join(artifacts_dir, f"{model_prefix}_checkpoint.ckpt")
-    logging.info(f"Saving the checkpoint to {checkpoint_path}.")
-    onnxblock.save_checkpoint((trainable_params, non_trainable_params), checkpoint_path)
+    train_model_path = os.path.join(artifacts_dir, f"{model_prefix}training_model.onnx")
+    eval_model_path = os.path.join(artifacts_dir, f"{model_prefix}eval_model.onnx")
+    optimizer_model_path = os.path.join(artifacts_dir, f"{model_prefix}optimizer_model.onnx")
+    checkpoint_path = os.path.join(artifacts_dir, f"{model_prefix}checkpoint")
     return train_model_path, eval_model_path, optimizer_model_path, checkpoint_path
diff --git a/orttraining/orttraining/test/python/qat_poc_example/qat.py b/orttraining/orttraining/test/python/qat_poc_example/qat.py
index 51a15475ee911..dcc9e116fda7d 100644
--- a/orttraining/orttraining/test/python/qat_poc_example/qat.py
+++ b/orttraining/orttraining/test/python/qat_poc_example/qat.py
@@ -46,7 +46,7 @@
     )
logging.info("Preparing the training artifacts for QAT.") - training_model_name = "mnist_qat" + training_model_name = "mnist_qat_" artifacts_dir = os.path.join(model_dir, "training_artifacts") utils.makedir(artifacts_dir) training_artifacts = create_training_artifacts( diff --git a/orttraining/orttraining/test/python/qat_poc_example/train.py b/orttraining/orttraining/test/python/qat_poc_example/train.py index 9a429d2adc6f1..a25c071c58a48 100644 --- a/orttraining/orttraining/test/python/qat_poc_example/train.py +++ b/orttraining/orttraining/test/python/qat_poc_example/train.py @@ -26,14 +26,10 @@ def _train_epoch(model, optimizer, train_loader): model.train() cumulative_loss = 0 for data, target in train_loader: - forward_inputs = [ - data.reshape(len(data), 784).numpy(), - target.numpy().astype(np.int32), - ] - train_loss = model(forward_inputs) + train_loss = model(data.reshape(len(data), 784).numpy(), target.numpy().astype(np.int64)) optimizer.step() model.lazy_reset_grad() - cumulative_loss += train_loss[0] + cumulative_loss += train_loss return cumulative_loss / len(train_loader) @@ -43,12 +39,8 @@ def _eval(model, test_loader): model.eval() cumulative_loss = 0 for data, target in test_loader: - forward_inputs = [ - data.reshape(len(data), 784).numpy(), - target.numpy().astype(np.int32), - ] - test_loss = model(forward_inputs) - cumulative_loss += test_loss[0] + test_loss = model(data.reshape(len(data), 784).numpy(), target.numpy().astype(np.int64)) + cumulative_loss += test_loss return cumulative_loss / len(test_loader) @@ -65,7 +57,7 @@ def train_model(qat_train_model, qat_eval_model, qat_optimizer_model, qat_checkp train_loader, test_loader = _get_dataloaders("data", batch_size) # Load the checkpoint state. - state = orttraining.CheckpointState(qat_checkpoint) + state = orttraining.CheckpointState.load_checkpoint(qat_checkpoint) # Create the training module. model = orttraining.Module(qat_train_model, state, qat_eval_model) From 8092a89688f92dee83d1d0111acaa1e1d2dfdb85 Mon Sep 17 00:00:00 2001 From: satyajandhyala Date: Tue, 20 Feb 2024 21:18:54 -0800 Subject: [PATCH 028/279] Changed command line argpasrse to process '--symmetric [True|False]'. (#19577) ### Description Accept the command line option --symmetric and its optional value correctly. If the optional value matches uncased to 'True' then set symmetric to True else set symmetric to False. Asymmetric quantization will generate zero_point input. ``` usage: matmul_4bits_quantizer.py [-h] --input_model INPUT_MODEL --output_model OUTPUT_MODEL [--block_size BLOCK_SIZE] [--symmetric [{True,False}]] [--accuracy_level ACCURACY_LEVEL] [-v] [--nodes_to_exclude NODES_TO_EXCLUDE [NODES_TO_EXCLUDE ...]] ``` ### Motivation and Context --- .../python/tools/quantization/matmul_4bits_quantizer.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py index 3e9f9a6544a71..eb7bbec997d59 100644 --- a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py @@ -349,6 +349,10 @@ def process(self): self.int4_quant_algo() +def ort_convert_str_to_bool(value): + return value.lower() in ("true", "1") + + def parse_args(): parser = argparse.ArgumentParser( description="""Blockwise int4 quantization for MatMul 2D weight matrices. 
@@ -366,7 +370,10 @@ def parse_args(): "--symmetric", required=False, default=True, - type=bool, + const=True, + nargs="?", + type=ort_convert_str_to_bool, + choices=[True, False], help="Indicate whether to quantize the model symmetrically", ) parser.add_argument( From 58f4921686bf0a5b0442fb6df92d1b1972a118cc Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 21 Feb 2024 00:31:06 -0800 Subject: [PATCH 029/279] [js] changes to allow Float16Array if any polyfill is available (#19305) ### Description This change adds only necessary code to enable ort-web works with any Float16Array polyfill. Unlike #19302, in this PR, ort-web does not include any specific polyfill; instead, it's user's choice for how to use a polyfill. ORT-web uses Float16Array if it's available; otherwise, fallback to use Uint16Array. ```js // case 1: user does not use polyfill: import * as ort from 'onnxruntime-web'; const myF16Data = new Uint16Array(...); // need to use Uint16Array const myF16tensor = new ort.Tensor('float16', myF16Data, dims); ``` ```js // case 2: user use polyfill: import * as ort from 'onnxruntime-web'; import { Float16Array, isFloat16Array, isTypedArray, getFloat16, setFloat16, f16round, } from "@petamoriken/float16"; globalThis.Float16Array = Float16Array; // ort-web will pick the global Float16Array const myF16Data = new Float16Array(...); // Use the polyfilled Float16Array type const myF16tensor = new ort.Tensor('float16', myF16Data, dims); ``` --- js/common/lib/tensor-impl-type-mapping.ts | 34 +++++++++++++++-------- js/common/lib/tensor-impl.ts | 10 ++++--- js/web/lib/wasm/wasm-common.ts | 9 +++++- 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/js/common/lib/tensor-impl-type-mapping.ts b/js/common/lib/tensor-impl-type-mapping.ts index c4a43ea27fea1..b29cb8cbd6d35 100644 --- a/js/common/lib/tensor-impl-type-mapping.ts +++ b/js/common/lib/tensor-impl-type-mapping.ts @@ -14,7 +14,6 @@ export const NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP = new Map { - if (!isBigIntChecked) { - isBigIntChecked = true; - const isBigInt64ArrayAvailable = typeof BigInt64Array !== 'undefined' && typeof BigInt64Array.from === 'function'; - const isBigUint64ArrayAvailable = - typeof BigUint64Array !== 'undefined' && typeof BigUint64Array.from === 'function'; +// a dummy type declaration for Float16Array in case any polyfill is available. +declare global { + // eslint-disable-next-line @typescript-eslint/naming-convention, @typescript-eslint/no-explicit-any + const Float16Array: any; +} + +// the following code allows delaying execution of BigInt/Float16Array checking. This allows lazy initialization for +// NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP and NUMERIC_TENSOR_TYPEDARRAY_TO_TYPE_MAP, which allows BigInt/Float16Array +// polyfill if available. 
+let isTypedArrayChecked = false; +export const checkTypedArray = () => { + if (!isTypedArrayChecked) { + isTypedArrayChecked = true; + const isBigInt64ArrayAvailable = typeof BigInt64Array !== 'undefined' && BigInt64Array.from; + const isBigUint64ArrayAvailable = typeof BigUint64Array !== 'undefined' && BigUint64Array.from; + const isFloat16ArrayAvailable = typeof Float16Array !== 'undefined' && Float16Array.from; if (isBigInt64ArrayAvailable) { NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP.set('int64', BigInt64Array); @@ -53,5 +58,12 @@ export const checkBigInt = () => { NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP.set('uint64', BigUint64Array); NUMERIC_TENSOR_TYPEDARRAY_TO_TYPE_MAP.set(BigUint64Array, 'uint64'); } + if (isFloat16ArrayAvailable) { + NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP.set('float16', Float16Array); + NUMERIC_TENSOR_TYPEDARRAY_TO_TYPE_MAP.set(Float16Array, 'float16'); + } else { + // if Float16Array is not available, use 'Uint16Array' to store the data. + NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP.set('float16', Uint16Array); + } } }; diff --git a/js/common/lib/tensor-impl.ts b/js/common/lib/tensor-impl.ts index de18126a9d0ae..56682ef98e117 100644 --- a/js/common/lib/tensor-impl.ts +++ b/js/common/lib/tensor-impl.ts @@ -5,7 +5,7 @@ import {tensorToDataURL, tensorToImageData} from './tensor-conversion-impl.js'; import {TensorToDataUrlOptions, TensorToImageDataOptions} from './tensor-conversion.js'; import {tensorFromGpuBuffer, tensorFromImage, tensorFromPinnedBuffer, tensorFromTexture} from './tensor-factory-impl.js'; import {CpuPinnedConstructorParameters, GpuBufferConstructorParameters, TensorFromGpuBufferOptions, TensorFromImageBitmapOptions, TensorFromImageDataOptions, TensorFromImageElementOptions, TensorFromTextureOptions, TensorFromUrlOptions, TextureConstructorParameters} from './tensor-factory.js'; -import {checkBigInt, NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP, NUMERIC_TENSOR_TYPEDARRAY_TO_TYPE_MAP, SupportedTypedArray, SupportedTypedArrayConstructors} from './tensor-impl-type-mapping.js'; +import {checkTypedArray, NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP, NUMERIC_TENSOR_TYPEDARRAY_TO_TYPE_MAP, SupportedTypedArray, SupportedTypedArrayConstructors} from './tensor-impl-type-mapping.js'; import {calculateSize, tensorReshape} from './tensor-utils-impl.js'; import {Tensor as TensorInterface} from './tensor.js'; @@ -67,8 +67,8 @@ export class Tensor implements TensorInterface { arg0: TensorType|TensorDataType|readonly string[]|readonly boolean[]|CpuPinnedConstructorParameters| TextureConstructorParameters|GpuBufferConstructorParameters, arg1?: TensorDataType|readonly number[]|readonly string[]|readonly boolean[], arg2?: readonly number[]) { - // perform one-time check for BigInt support - checkBigInt(); + // perform one-time check for BigInt/Float16Array support + checkTypedArray(); let type: TensorType; let dims: readonly number[]; @@ -142,7 +142,9 @@ export class Tensor implements TensorInterface { throw new TypeError(`Unsupported tensor type: ${arg0}.`); } if (Array.isArray(arg1)) { - if (arg0 === 'float16') { + if (arg0 === 'float16' && typedArrayConstructor === Uint16Array) { + // When no Float16Array polyfill is used, we cannot create 'float16' tensor from number array. + // // Throw error here because when user try to use number array as data, // e.g. new Tensor('float16', [1, 2, 3, 4], dims)), it will actually call // Uint16Array.from(arg1) which generates wrong data. 
diff --git a/js/web/lib/wasm/wasm-common.ts b/js/web/lib/wasm/wasm-common.ts index 93910af1f1bf0..54eaf5e0c43cc 100644 --- a/js/web/lib/wasm/wasm-common.ts +++ b/js/web/lib/wasm/wasm-common.ts @@ -3,6 +3,12 @@ import {Tensor} from 'onnxruntime-common'; +// a dummy type declaration for Float16Array in case any polyfill is available. +declare global { + // eslint-disable-next-line @typescript-eslint/naming-convention, @typescript-eslint/no-explicit-any + const Float16Array: any; +} + // This file includes common definitions. They do NOT have dependency on the WebAssembly instance. /** @@ -117,7 +123,8 @@ export const tensorTypeToTypedArrayConstructor = (type: Tensor.Type): Float32Arr Uint8ArrayConstructor|Float64ArrayConstructor|Uint32ArrayConstructor|BigUint64ArrayConstructor => { switch (type) { case 'float16': - return Uint16Array; + // allow Float16Array polyfill. + return typeof Float16Array !== 'undefined' && Float16Array.from ? Float16Array : Uint16Array; case 'float32': return Float32Array; case 'uint8': From 57d6819212464f49b30db047528be0f409dadc67 Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Thu, 22 Feb 2024 00:08:47 +0800 Subject: [PATCH 030/279] [js/web] Fix fused-conv is not included in npm test (#19581) BUG: https://github.com/microsoft/onnxruntime/issues/18855 ### Description ### Motivation and Context --- js/web/test/suite-test-list.jsonc | 1 + 1 file changed, 1 insertion(+) diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 1c61518ddcdd2..b43b1ac37e37d 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -1354,6 +1354,7 @@ "expand.jsonc", "fast-gelu.jsonc", "floor.jsonc", + "fused-conv.jsonc", "gather-elements.jsonc", "gemm.jsonc", "global-average-pool.jsonc", From e5ce81ae847d0b347a3dfe95abfc9e407e2f0469 Mon Sep 17 00:00:00 2001 From: Adam Pocock Date: Wed, 21 Feb 2024 15:24:41 -0500 Subject: [PATCH 031/279] [java] Adding ML program flag for CoreML (#19551) ### Description Adds the new CoreML enum flags to enable ML Program support in Java. ### Motivation and Context Adds support for #19347 to the Java API. --- .../ai/onnxruntime/providers/CoreMLFlags.java | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/onnxruntime/providers/CoreMLFlags.java b/java/src/main/java/ai/onnxruntime/providers/CoreMLFlags.java index eb124decf75f3..cec3fadf446ca 100644 --- a/java/src/main/java/ai/onnxruntime/providers/CoreMLFlags.java +++ b/java/src/main/java/ai/onnxruntime/providers/CoreMLFlags.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2024, Oracle and/or its affiliates. All rights reserved. * Licensed under the MIT License. */ package ai.onnxruntime.providers; @@ -14,7 +14,18 @@ public enum CoreMLFlags implements OrtFlags { /** Enables CoreML on subgraphs. */ ENABLE_ON_SUBGRAPH(2), // COREML_FLAG_ENABLE_ON_SUBGRAPH(0x002) /** Only enable usage of CoreML if the device has an Apple Neural Engine. */ - ONLY_ENABLE_DEVICE_WITH_ANE(4); // COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE(0x004), + ONLY_ENABLE_DEVICE_WITH_ANE(4), // COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE(0x004) + /** + * Only allow CoreML EP to take nodes with inputs with static shapes. By default it will also + * allow inputs with dynamic shapes. However, the performance may be negatively impacted if inputs + * have dynamic shapes. 
+ */ + ONLY_ALLOW_STATIC_INPUT_SHAPES(8), // COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES(0x008) + /** + * Create an MLProgram. By default it will create a NeuralNetwork model. Requires Core ML 5 or + * later. + */ + CREATE_MLPROGRAM(16); // COREML_FLAG_CREATE_MLPROGRAM(0x010) /** The native value of the enum. */ public final int value; From 3afb38cfb7d4263f262dea33bcfa16d35c67fede Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 21 Feb 2024 12:46:16 -0800 Subject: [PATCH 032/279] [CUDA] Add use_tf32 cuda provider option (for FP32 Conv) (#19426) Follow up of https://github.com/microsoft/onnxruntime/pull/19357 to apply the use_tf32 option on fp32 cuDNN convolution. When use_tf32 = 0, we will disable TF32 in cuDNN convolution for FP32 inputs. https://docs.nvidia.com/deeplearning/cudnn/api/cudnn-graph-library.html#cudnnmathtype-t **CUDNN_FMA_MATH** - Restricted to only kernels that use FMA instructions. - On pre-NVIDIA A100 GPU devices, CUDNN_DEFAULT_MATH and CUDNN_FMA_MATH have the same behavior: Tensor Core kernels will not be selected. - With NVIDIA Ampere architecture and CUDA toolkit 11, CUDNN_DEFAULT_MATH permits TF32 Tensor Core operation and CUDNN_FMA_MATH does not. - The TF32 behavior for CUDNN_DEFAULT_MATH and the other Tensor Core math types can be explicitly disabled by the environment variable NVIDIA_TF32_OVERRIDE=0. --- onnxruntime/core/providers/cuda/nn/conv.cc | 17 ++++++++++++++--- onnxruntime/core/providers/cuda/nn/conv.h | 3 ++- .../core/providers/cuda/nn/conv_transpose.cc | 10 ++++++++-- .../training_ops/cuda/nn/conv_grad.cc | 3 ++- .../training_ops/cuda/nn/conv_shared.cc | 6 ++++-- .../training_ops/cuda/nn/conv_shared.h | 2 +- .../training_ops/cuda/nn/conv_transpose_grad.cc | 6 ++++-- 7 files changed, 35 insertions(+), 12 deletions(-) diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index 82f3503919237..a417be5a86c32 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -326,7 +326,8 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) ORT_RETURN_IF_ERROR(s_.conv_desc.Set(kernel_shape.size(), pads, strides, dilations, gsl::narrow_cast(conv_attrs_.group), - CUDNN_CROSS_CORRELATION, CudnnTensor::GetDataType())); + CUDNN_CROSS_CORRELATION, CudnnTensor::GetDataType(), + UseTF32())); if (context->InputCount() >= 3) { const Tensor* B = context->Input(2); @@ -351,8 +352,13 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) if (!s_.cached_benchmark_results.contains(x_dims_cudnn)) { // set math type to tensor core before algorithm search - if constexpr (std::is_same::value) + if constexpr (std::is_same::value) { CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, CUDNN_TENSOR_OP_MATH)); + } else if constexpr (std::is_same::value) { + if (!UseTF32()) { + CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, CUDNN_FMA_MATH)); + } + } cudnnConvolutionFwdAlgoPerf_t perf; int algo_count = 1; @@ -399,6 +405,8 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) CUDNN_RETURN_IF_ERROR(GetWorkspaceSize(GetCudnnHandle(context), s_, perf.algo, &perf.memory)); if (std::is_same::value) { perf.mathType = CUDNN_TENSOR_OP_MATH; + } else if (std::is_same::value && !UseTF32()) { + perf.mathType = CUDNN_FMA_MATH; } else { perf.mathType = CUDNN_DEFAULT_MATH; } @@ -480,7 +488,8 @@ Status CudnnConvolutionDescriptor::Set( const gsl::span& dilations, int groups, cudnnConvolutionMode_t mode, - cudnnDataType_t 
data_type) { + cudnnDataType_t data_type, + bool use_tf32) { if (!desc_) CUDNN_RETURN_IF_ERROR(cudnnCreateConvolutionDescriptor(&desc_)); @@ -513,6 +522,8 @@ Status CudnnConvolutionDescriptor::Set( CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(desc_, CUDNN_DEFAULT_MATH)); if (data_type == CUDNN_DATA_HALF) { CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(desc_, CUDNN_TENSOR_OP_MATH)); + } else if (data_type == CUDNN_DATA_FLOAT && !use_tf32) { + CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(desc_, CUDNN_FMA_MATH)); } return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/nn/conv.h b/onnxruntime/core/providers/cuda/nn/conv.h index bcaa4d855b81e..181fbc99fd8e9 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.h +++ b/onnxruntime/core/providers/cuda/nn/conv.h @@ -29,7 +29,8 @@ class CudnnConvolutionDescriptor final { const gsl::span& dilations, int groups, cudnnConvolutionMode_t mode, - cudnnDataType_t data_type); + cudnnDataType_t data_type, + bool use_tf32); operator cudnnConvolutionDescriptor_t() const { return desc_; } diff --git a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc index 55dceaa2698e8..939b9959af818 100644 --- a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc +++ b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc @@ -167,7 +167,8 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dy cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION; ORT_RETURN_IF_ERROR(s_.conv_desc.Set(p.kernel_shape.size(), p.pads, p.strides, p.dilations, gsl::narrow_cast(conv_transpose_attrs_.group), mode, - CudnnTensor::GetDataType())); + CudnnTensor::GetDataType(), + UseTF32())); if (has_bias) { const auto& b_shape = p.B->Shape(); @@ -187,8 +188,13 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dy GetScratchBuffer(AlgoSearchWorkspaceSize, context->GetComputeStream()); // set math type to tensor core before algorithm search - if constexpr (std::is_same::value) + if constexpr (std::is_same::value) { CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, CUDNN_TENSOR_OP_MATH)); + } else if constexpr (std::is_same::value) { + if (!UseTF32()) { + CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, CUDNN_FMA_MATH)); + } + } cudnnConvolutionBwdDataAlgoPerf_t perf; int algo_count = 1; diff --git a/orttraining/orttraining/training_ops/cuda/nn/conv_grad.cc b/orttraining/orttraining/training_ops/cuda/nn/conv_grad.cc index f6c58445c0a5d..fc5d9b65d0f89 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/conv_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/nn/conv_grad.cc @@ -114,7 +114,8 @@ Status ConvGrad::PrepareArgs(const Tensor& x, const Tensor& dY, const Tensor& ORT_RETURN_IF_ERROR(args_.y_tensor.Set(dy_dims, args_.params.data_type)); ORT_RETURN_IF_ERROR(args_.conv_desc.Set(kernel_shape.size(), pads, strides, dilations, gsl::narrow_cast(conv_attrs_.group), CUDNN_CROSS_CORRELATION, - args_.params.data_type)); + args_.params.data_type, + UseTF32())); if (dB) { const TensorShape& db_shape = dB->Shape(); diff --git a/orttraining/orttraining/training_ops/cuda/nn/conv_shared.cc b/orttraining/orttraining/training_ops/cuda/nn/conv_shared.cc index 5dc16c68f6210..d23905496c9bb 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/conv_shared.cc +++ b/orttraining/orttraining/training_ops/cuda/nn/conv_shared.cc @@ -233,11 +233,13 @@ bool ConvParamsEqual::operator()(const ConvParams& a, const ConvParams& b) const } template -Status 
AlgoIterator::OnlyDefaultAlgorithm(const ConvArgs& args, std::vector& perf_results) { +Status AlgoIterator::OnlyDefaultAlgorithm(const ConvArgs& args, std::vector& perf_results, bool use_tf32) { perf_results.resize(1); perf_results[0].algo = AlgoSearch::DEFAULT_ALGO; if (args.params.data_type == CUDNN_DATA_HALF) { perf_results[0].mathType = CUDNN_TENSOR_OP_MATH; + } else if (args.params.data_type == CUDNN_DATA_FLOAT && !use_tf32) { + perf_results[0].mathType = CUDNN_FMA_MATH; } else { perf_results[0].mathType = CUDNN_DEFAULT_MATH; } @@ -256,7 +258,7 @@ Status AlgoIterator::TryAll(const CUDAExecutionProvider* provider, const std::vector perf_results; ORT_RETURN_IF_ERROR(args_.params.algo_mode == OrtCudnnConvAlgoSearchDefault - ? OnlyDefaultAlgorithm(args_, perf_results) + ? OnlyDefaultAlgorithm(args_, perf_results, provider->UseTF32()) : AlgoSearch::FindAlgorithms(args_, provider, allocator, perf_results)); for (auto& algo_perf : perf_results) { if (f(algo_perf) == Status::OK()) { diff --git a/orttraining/orttraining/training_ops/cuda/nn/conv_shared.h b/orttraining/orttraining/training_ops/cuda/nn/conv_shared.h index a2d4bf3bdc006..3fdb4306bfbbb 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/conv_shared.h +++ b/orttraining/orttraining/training_ops/cuda/nn/conv_shared.h @@ -75,7 +75,7 @@ class AlgoIterator { Status TryAll(const CUDAExecutionProvider* provider, const AllocatorPtr& allocator, std::function f); - static Status OnlyDefaultAlgorithm(const ConvArgs& args, std::vector& perf_results); + static Status OnlyDefaultAlgorithm(const ConvArgs& args, std::vector& perf_results, bool use_tf32); private: const ConvArgs& args_; diff --git a/orttraining/orttraining/training_ops/cuda/nn/conv_transpose_grad.cc b/orttraining/orttraining/training_ops/cuda/nn/conv_transpose_grad.cc index 5f7206fc121ec..d3f5a89434a48 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/conv_transpose_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/nn/conv_transpose_grad.cc @@ -182,7 +182,8 @@ Status ConvTransposeGrad::PrepareConvForwardArgs(const Tensor& X, const Tenso ORT_RETURN_IF_ERROR(args.y_tensor.Set(y_dims, args.params.data_type)); ORT_RETURN_IF_ERROR(args.conv_desc.Set(kernel_shape.size(), pads, strides, dilations, gsl::narrow_cast(conv_attrs_.group), CUDNN_CROSS_CORRELATION, - args.params.data_type)); + args.params.data_type, + UseTF32())); } return Status::OK(); @@ -287,7 +288,8 @@ Status ConvTransposeGrad::PrepareConvBackwardFilterArgs(const Tensor& X, cons ORT_RETURN_IF_ERROR(args.y_tensor.Set(y_dims, args.params.data_type)); ORT_RETURN_IF_ERROR(args.conv_desc.Set(kernel_shape.size(), pads, strides, dilations, gsl::narrow_cast(conv_attrs_.group), CUDNN_CROSS_CORRELATION, - args.params.data_type)); + args.params.data_type, + UseTF32())); if (dB) { const auto& b_shape = dB->Shape(); From ebd220b0730f9898aaa0275ef0d8195ce70057d0 Mon Sep 17 00:00:00 2001 From: Matttttt <18152455+martholomew@users.noreply.github.com> Date: Wed, 21 Feb 2024 21:38:18 +0000 Subject: [PATCH 033/279] Misspelling in README.md (#19433) Fixed a misspelling. 
--- js/web/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/js/web/README.md b/js/web/README.md index c75a40ad6da28..906c78a1b7ec4 100644 --- a/js/web/README.md +++ b/js/web/README.md @@ -12,7 +12,7 @@ The [Open Neural Network Exchange](http://onnx.ai/) (ONNX) is an open standard f With ONNX Runtime Web, web developers can score models directly on browsers with various benefits including reducing server-client communication and protecting user privacy, as well as offering install-free and cross-platform in-browser ML experience. -ONNX Runtime Web can run on both CPU and GPU. On CPU side, [WebAssembly](https://developer.mozilla.org/en-US/docs/WebAssembly) is adopted to execute the model at near-native speed. ONNX Runtime Web complies the native ONNX Runtime CPU engine into WebAssembly backend by using Emscripten, so it supports most functionalities native ONNX Runtime offers, including full ONNX operator coverage, multi-threading, [ONNX Runtime Quantization](https://www.onnxruntime.ai/docs/how-to/quantization.html) as well as [ONNX Runtime Mobile](https://onnxruntime.ai/docs/tutorials/mobile/). For performance acceleration with GPUs, ONNX Runtime Web leverages WebGL, a popular standard for accessing GPU capabilities. We are keeping improving op coverage and optimizing performance in WebGL backend. +ONNX Runtime Web can run on both CPU and GPU. On CPU side, [WebAssembly](https://developer.mozilla.org/en-US/docs/WebAssembly) is adopted to execute the model at near-native speed. ONNX Runtime Web compiles the native ONNX Runtime CPU engine into WebAssembly backend by using Emscripten, so it supports most functionalities native ONNX Runtime offers, including full ONNX operator coverage, multi-threading, [ONNX Runtime Quantization](https://www.onnxruntime.ai/docs/how-to/quantization.html) as well as [ONNX Runtime Mobile](https://onnxruntime.ai/docs/tutorials/mobile/). For performance acceleration with GPUs, ONNX Runtime Web leverages WebGL, a popular standard for accessing GPU capabilities. We are keeping improving op coverage and optimizing performance in WebGL backend. See [Compatibility](#Compatibility) and [Operators Supported](#Operators) for a list of platforms and operators ONNX Runtime Web currently supports. @@ -22,7 +22,7 @@ Refer to [ONNX Runtime JavaScript examples](https://github.com/microsoft/onnxrun ## Documents -### Developement +### Development Refer to the following links for development information: From 38c34323939bac03b9648b2e59dbbe8de0bd7092 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 21 Feb 2024 13:58:53 -0800 Subject: [PATCH 034/279] Bump ip from 1.1.8 to 1.1.9 in /js/react_native (#19582) Bumps [ip](https://github.com/indutny/node-ip) from 1.1.8 to 1.1.9.
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=ip&package-manager=npm_and_yarn&previous-version=1.1.8&new-version=1.1.9)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) Dependabot will merge this PR once CI passes on it, as requested by @fs-eire. [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/microsoft/onnxruntime/network/alerts).
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- js/react_native/yarn.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/js/react_native/yarn.lock b/js/react_native/yarn.lock index 4dca90d7415cf..bbb0c4f3d1e22 100644 --- a/js/react_native/yarn.lock +++ b/js/react_native/yarn.lock @@ -3701,9 +3701,9 @@ invariant@^2.2.4: loose-envify "^1.0.0" ip@^1.1.5: - version "1.1.8" - resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.8.tgz#ae05948f6b075435ed3307acce04629da8cdbf48" - integrity sha512-PuExPYUiu6qMBQb4l06ecm6T6ujzhmh+MeJcW9wa89PoAz5pvd4zPgN5WJV104mb6S2T1AwNIAaB70JNrLQWhg== + version "1.1.9" + resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.9.tgz#8dfbcc99a754d07f425310b86a99546b1151e396" + integrity sha512-cyRxvOEpNHNtchU3Ln9KC/auJgup87llfQpQ+t5ghoC/UhL16SWzbueiCsdTnWmqAWl7LadfuwhlqmtOaqMHdQ== is-absolute@^1.0.0: version "1.0.0" From 5197db19802a39e47d19ac829cd08a94bacbdfbb Mon Sep 17 00:00:00 2001 From: Sheil Kumar Date: Wed, 21 Feb 2024 15:45:44 -0800 Subject: [PATCH 035/279] Diable __cpuid call for ARM64EC (#19592) Diable __cpuid call for ARM64EC Co-authored-by: Sheil Kumar --- winml/lib/Api/HardwareCoreEnumerator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/winml/lib/Api/HardwareCoreEnumerator.cpp b/winml/lib/Api/HardwareCoreEnumerator.cpp index b6b44690f4f6c..d04e276347170 100644 --- a/winml/lib/Api/HardwareCoreEnumerator.cpp +++ b/winml/lib/Api/HardwareCoreEnumerator.cpp @@ -84,7 +84,7 @@ uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() { // # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores. auto cores = GetNumberOPhysicalAndEngineeringCores(); -#if !defined(_M_ARM64) && !defined(__aarch64__) +#if !defined(_M_ARM64EC) && !defined(_M_ARM64) && !defined(__aarch64__) const int kVendorID_Intel[3] = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI" int regs_leaf0[4]; int regs_leaf7[4]; From 3d88487c96bf467c4b83dff179c9e282602e2d64 Mon Sep 17 00:00:00 2001 From: Vincent Wang Date: Thu, 22 Feb 2024 10:35:26 +0800 Subject: [PATCH 036/279] Minor Triton Fix (#19589) Including removing a unnecessary assert, and add support of passing string attribute from ONNX node attribute to python functoin kwargs (mainly for passing debug info from graph to python for now). 
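For illustration only, a minimal sketch of what the Python side could look like once string attributes come through as kwargs — the function name `triton_entry` and the kwarg names below are hypothetical, not the actual ORT Triton interface:

```python
# Hypothetical sketch: `triton_entry` and the kwarg names are illustrative only,
# not the real ORT Triton entry point.
def triton_entry(*tensors, **kwargs):
    # int64 and float attributes were already forwarded; string attributes
    # (e.g. a debug label describing the originating subgraph) now arrive as Python str.
    num_elements = kwargs.get("num_elements", 0)   # from an int64 node attribute
    alpha = kwargs.get("alpha", 1.0)               # from a float node attribute
    debug_label = kwargs.get("debug_label", "")    # from a string node attribute
    print(f"[{debug_label}] launching kernel over {num_elements} elements (alpha={alpha})")
    return tensors


# Example call mirroring how the C++ executor forwards ONNX node attributes as kwargs:
triton_entry(num_elements=1024, alpha=0.5, debug_label="Gemm_0+Relu_1")
```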
--- .../orttraining/core/framework/triton/triton_op_executor.cc | 2 ++ orttraining/orttraining/python/training/ort_triton/_utils.py | 3 ++- orttraining/orttraining/training_ops/cpu/triton/triton_op.h | 5 ++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/orttraining/orttraining/core/framework/triton/triton_op_executor.cc b/orttraining/orttraining/core/framework/triton/triton_op_executor.cc index 092ab89d5d760..f30d6ddee253a 100644 --- a/orttraining/orttraining/core/framework/triton/triton_op_executor.cc +++ b/orttraining/orttraining/core/framework/triton/triton_op_executor.cc @@ -106,6 +106,8 @@ void TritonOpExecutor::ExecuteByFuncName(const std::string& func_name, const Inl PyDict_SetItemString(python_kwargs.get(), kv.first.c_str(), PyLong_FromLongLong(std::stoll(kv.second.first))); } else if (kv.second.second == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) { PyDict_SetItemString(python_kwargs.get(), kv.first.c_str(), PyFloat_FromDouble(std::stod(kv.second.first))); + } else if (kv.second.second == ONNX_NAMESPACE::TensorProto_DataType_STRING) { + PyDict_SetItemString(python_kwargs.get(), kv.first.c_str(), PyUnicode_FromString(kv.second.first.c_str())); } else { ORT_THROW("Unsupported kwargs data type: ", kv.second.second); } diff --git a/orttraining/orttraining/python/training/ort_triton/_utils.py b/orttraining/orttraining/python/training/ort_triton/_utils.py index 95e6703be8783..877eacc0b775f 100644 --- a/orttraining/orttraining/python/training/ort_triton/_utils.py +++ b/orttraining/orttraining/python/training/ort_triton/_utils.py @@ -141,13 +141,14 @@ def get_reduce_info(node: NodeProto, graph: GraphProto, input_rank: int) -> Tupl def next_power_of_2(n: int) -> int: - assert n <= 2**32, "32-bit only" + """Return the smallest power of 2 greater than or equal to n""" n -= 1 n |= n >> 1 n |= n >> 2 n |= n >> 4 n |= n >> 8 n |= n >> 16 + n |= n >> 32 n += 1 return n diff --git a/orttraining/orttraining/training_ops/cpu/triton/triton_op.h b/orttraining/orttraining/training_ops/cpu/triton/triton_op.h index f226db76f7ed7..db8e8558ab884 100644 --- a/orttraining/orttraining/training_ops/cpu/triton/triton_op.h +++ b/orttraining/orttraining/training_ops/cpu/triton/triton_op.h @@ -25,12 +25,15 @@ class TritonOp final : public OpKernel { attr.first == "onnx_string") { continue; } - // Support int64 and float only for now, skip other types. + // Support int64, float and string only for now, skip other types. 
if (attr.second.type() == ONNX_NAMESPACE::AttributeProto::AttributeType::AttributeProto_AttributeType_INT) { kwargs_.insert({attr.first, {std::to_string(attr.second.i()), ONNX_NAMESPACE::TensorProto_DataType_INT64}}); } else if (attr.second.type() == ONNX_NAMESPACE::AttributeProto::AttributeType::AttributeProto_AttributeType_FLOAT) { kwargs_.insert({attr.first, {std::to_string(attr.second.f()), ONNX_NAMESPACE::TensorProto_DataType_FLOAT}}); + } else if (attr.second.type() == + ONNX_NAMESPACE::AttributeProto::AttributeType::AttributeProto_AttributeType_STRING) { + kwargs_.insert({attr.first, {attr.second.s(), ONNX_NAMESPACE::TensorProto_DataType_STRING}}); } } } From 8354329086ebb190db9ea0cb6a3fa72f53f8f881 Mon Sep 17 00:00:00 2001 From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com> Date: Thu, 22 Feb 2024 13:34:45 +0800 Subject: [PATCH 037/279] [ROCm] SkipGroupNorm triton (#19408) Change GroupNorm triton to support SkipGroupNorm --- .../rocm/diffusion/group_norm_triton.cuh | 23 ++++++++--- .../rocm/diffusion/group_norm_triton.py | 39 +++++++++++++++++-- .../kernel_explorer/kernels/groupnorm_test.py | 12 ++++++ 3 files changed, 64 insertions(+), 10 deletions(-) diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh index b3d3e92209b39..c6ca16bfdfc80 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh @@ -46,8 +46,6 @@ auto GetTritonGroupNormNHWCTypeStringAndOps() { auto block_size = metadata->constants.at("BLOCK_SIZE"); auto hw_size = metadata->constants.at("HW_SIZE"); auto impl = [i, block_size, hw_size](const GroupNormNHWCTunableParams* params) -> Status { - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF((params->skip != nullptr || params->bias != nullptr), - "Input skip or bias is not supported by triton kernel."); TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( params->channels_per_group > block_size || params->channels_per_group * 2 <= block_size, "Arg block_size (", block_size, ") is not the next power of 2 of channels_per_group (", @@ -61,23 +59,36 @@ auto GetTritonGroupNormNHWCTypeStringAndOps() { } // Construct args for launch kernel struct { - void* X; - void* Y; + const void* src; + const void* skip; + const void* bias; + void* out; + void* add_out; const void* gamma; const void* beta; int hw; int c; int c_per_group; float eps; + bool has_skip; + bool has_bias; + bool broadcast_skip; } args = { - (void*)params->src, + (const void*)params->src, + (const void*)params->skip, + (const void*)params->bias, (void*)params->dst, + (void*)params->skip_workspace, (const void*)params->gamma, (const void*)params->beta, params->hw, params->c, params->channels_per_group, - params->epsilon}; + params->epsilon, + params->skip != nullptr, + params->bias != nullptr, + params->broadcast_skip, + }; // Grid dim is (batch_count, groups, 1) return LaunchTritonKernel(params->StreamHandle(), i, params->n, params->groups, 1, &args, sizeof(args)); diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py index 5368cb1cf635b..5ba96ebc117f0 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py @@ -12,13 +12,19 @@ @triton.jit def group_norm_kernel( input_ptr, + skip_ptr, + bias_ptr, output_ptr, + add_out_ptr, gamma_ptr, beta_ptr, img_size, c, c_per_group, eps, + has_skip, + 
has_bias, + broadcast_skip, BLOCK_SIZE: tl.constexpr, HW_SIZE: tl.constexpr, ACTIVATION_SILU: tl.constexpr, @@ -36,14 +42,35 @@ def group_norm_kernel( offsets = hw[:, None] * c + cols[None, :] mask = (cols < c_per_group)[None, :] + bias = tl.zeros([BLOCK_SIZE], dtype=tl.float32) + if has_skip: + add_out_ptr += row_x * stride + row_y * c_per_group + if broadcast_skip: + broadcast_skip_ptr = skip_ptr + row_x * c + row_y * c_per_group + bias += tl.load(broadcast_skip_ptr + cols, mask=cols < c_per_group, other=0.0).to(tl.float32) + else: + skip_ptr += row_x * stride + row_y * c_per_group + if has_bias: + bias_ptr += row_y * c_per_group + bias += tl.load(bias_ptr + cols, mask=cols < c_per_group, other=0.0).to(tl.float32) + # Calculate mean and variance _sum = tl.zeros([HW_SIZE, BLOCK_SIZE], dtype=tl.float32) _square_sum = tl.zeros([HW_SIZE, BLOCK_SIZE], dtype=tl.float32) for i in range(tl.cdiv(img_size, HW_SIZE)): x_ptr = input_ptr + i * HW_SIZE * c a = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + if has_skip and not broadcast_skip: + s_ptr = skip_ptr + i * HW_SIZE * c + s = tl.load(s_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + a += s + if has_bias or broadcast_skip: + a += bias _sum += a _square_sum += a * a + if has_skip: + add_y_ptr = add_out_ptr + i * HW_SIZE * c + tl.store(add_y_ptr + offsets, a, mask=mask) # Set axis=None (or leave it unspecified) to reduce all axes. # TODO: In older Triton we have to reduce an axis at a time, but in our case @@ -57,9 +84,13 @@ def group_norm_kernel( gamma = tl.load(gamma_ptr + cols, mask=cols < c_per_group).to(tl.float32) beta = tl.load(beta_ptr + cols, mask=cols < c_per_group).to(tl.float32) for i in range(tl.cdiv(img_size, HW_SIZE)): - x_ptr = input_ptr + i * HW_SIZE * c y_ptr = output_ptr + i * HW_SIZE * c - x = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + if has_skip: + add_y_ptr = add_out_ptr + i * HW_SIZE * c + x = tl.load(add_y_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + else: + x_ptr = input_ptr + i * HW_SIZE * c + x = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32) x_hat = (x - group_mean) * rstd y = x_hat * gamma + beta if ACTIVATION_SILU: @@ -77,7 +108,7 @@ def group_norm_kernel( hw_sizes = [8, 16, 32, 64, 128, 256] warps = [1, 2, 4, 8, 16] name_pattern = "GroupNormTriton_{}_{}_b{}_hw{}_w{}" -sig_pattern = "*{},*{},*fp32,*fp32,i32,i32,i32,fp32" +sig_pattern = "*{},*{},*{},*{},*{},*fp32,*fp32,i32,i32,i32,fp32,i1,i1,i1" group_pattern = "GroupNormTriton_{}_{}" @@ -88,7 +119,7 @@ def get_function_table(): silu_suffix = "Silu" if silu else "Pass" name = name_pattern.format(silu_suffix, dtype, b, hw_size, warp) group = group_pattern.format(silu_suffix, dtype) - sig = sig_pattern.format(dtype, dtype) + sig = sig_pattern.format(dtype, dtype, dtype, dtype, dtype) kwargs = { "num_warps": warp, "constants": {"BLOCK_SIZE": b, "HW_SIZE": hw_size, "ACTIVATION_SILU": int(silu)}, diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py index 8334d20e47c86..400a9d8a7a187 100644 --- a/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py +++ b/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py @@ -80,6 +80,18 @@ def run_group_norm( ) use_silu = silu broadcast_skip = False + if has_skip: + skip_x_shape = skip_x.shape + b2 = len(skip_x_shape) == 2 and skip_x_shape[0] == batch_size and skip_x_shape[1] == num_channels + b4 = ( + len(skip_x_shape) == 4 + and skip_x_shape[0] == 
batch_size + and skip_x_shape[1] == 1 + and skip_x_shape[2] == 1 + and skip_x_shape[3] == num_channels + ) + if b2 or b4: + broadcast_skip = True channels_per_block = 0 # Compute in params initialization input_d = ke.DeviceArray(input_x.astype(dtype)) From 05ed89f46980b7e5a5328bc20af8b32ca9f1f715 Mon Sep 17 00:00:00 2001 From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com> Date: Thu, 22 Feb 2024 13:34:55 +0800 Subject: [PATCH 038/279] [ROCm] Add excluded libs for ROCm python package (#19586) The rocm lib version has changed in rocm 6.0 Using libs packaged in whl might cause errors. For example, `libamdhip64.so.6` packaged in whl will cause compute error when training gpt2 model. The root cause still in investigating. --- setup.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/setup.py b/setup.py index 03e1cb75ba581..9a5fc29dd5e02 100644 --- a/setup.py +++ b/setup.py @@ -205,18 +205,23 @@ def run(self): rocm_dependencies = [ "libamd_comgr.so.2", "libamdhip64.so.5", + "libamdhip64.so.6", "libdrm.so.2", "libdrm_amdgpu.so.1", "libelf.so.1", "libhipfft.so.0", "libhiprtc.so.5", + "libhiprtc.so.6", "libhsa-runtime64.so.1", "libMIOpen.so.1", "libnuma.so.1", "librccl.so.1", "librocblas.so.3", + "librocblas.so.4", "librocfft.so.0", + "libroctx64.so.4", "librocm_smi64.so.5", + "librocm_smi64.so.6", "libroctracer64.so.4", "libtinfo.so.6", "libmigraphx_c.so.3", From 6b73ab3e3e72a9f2008e8d0e221b0be77d2993b1 Mon Sep 17 00:00:00 2001 From: cao lei Date: Thu, 22 Feb 2024 10:19:08 -0800 Subject: [PATCH 039/279] Introduce reused_buffer_index_per_stream in allocation planner which will be reset after computing the reuse buffer for each stream (#19515) ### Description Introduce reused_buffer_index_per_stream in allocation planner which will be reset after computing the reuse buffer for each stream. So if a NodeArg is an input of several Ops across different streams and reuses other NodeArg, the reused NodeArg won't be involved when computing the second stream's reuse plan. ### Motivation and Context This is to fix https://github.com/microsoft/onnxruntime/issues/19480, which is a crash for the scenario mentioned above. --------- Co-authored-by: Lei Cao --- .../core/framework/allocation_planner.cc | 44 ++++++------ .../test/framework/allocation_planner_test.cc | 68 ++++++++++++++++++ .../multi_stream_models/issue_19480.onnx | Bin 0 -> 760 bytes 3 files changed, 91 insertions(+), 21 deletions(-) create mode 100644 onnxruntime/test/testdata/multi_stream_models/issue_19480.onnx diff --git a/onnxruntime/core/framework/allocation_planner.cc b/onnxruntime/core/framework/allocation_planner.cc index ea7a6432a7507..158ab8ed610f4 100644 --- a/onnxruntime/core/framework/allocation_planner.cc +++ b/onnxruntime/core/framework/allocation_planner.cc @@ -182,7 +182,6 @@ class PlannerImpl { // upstream_node_0 and upstream_node_1 are the immmediate upstream nodes of downstream_node // upstream_node_2 is the immediate nodes ahead of downstream_node in the same logic stream InlinedHashMap> dependence_graph_; - InlinedHashMap> value_consumer_map_; InlinedHashMap value_node_map_; // OrtValueInfo: Auxiliary information about an OrtValue used only during plan-generation: @@ -295,7 +294,7 @@ class PlannerImpl { } #endif - // Find if there exists some input tensor that we can use in-place for output_arg_num-th input in the node. + // Find if there exists some input tensor that we can use in-place for output_arg_num-th output in the node. 
bool FindReusableInput(const onnxruntime::Node& node, int output_arg_num, OrtValueIndex* reusable_input, bool* is_strided_tensor) { *is_strided_tensor = false; @@ -530,6 +529,7 @@ class PlannerImpl { // Initialize allocation plan: plan_.allocation_plan.resize(num_ml_values); + for (int i = 0; static_cast(i) < num_ml_values; i++) AllocPlan(i).reused_buffer = i; } bool HasExternalOutputs(const Node& node) const { @@ -1065,7 +1065,8 @@ class PlannerImpl { // build the consumer list for each value int num_ml_values = ort_value_name_idx_map_.MaxIdx() + 1; - value_consumer_map_.reserve(num_ml_values); + InlinedHashMap> value_consumer_map; + value_consumer_map.reserve(num_ml_values); // iterate each stream from back, so the first element is the last consumer in single stream case for (auto& stream : stream_nodes_) { @@ -1078,10 +1079,10 @@ class PlannerImpl { const auto& name = input.Name(); int value_idx; ORT_RETURN_IF_ERROR(ort_value_name_idx_map_.GetIdx(name, value_idx)); - auto origin = Buffer(value_idx); - if (origin != -1 && plan_.allocation_plan[origin].alloc_kind == AllocKind::kAllocate) { + auto origin = AllocPlan(value_idx).reused_buffer; + if (AllocPlan(origin).alloc_kind == AllocKind::kAllocate) { // add current node as consumer for origin buffer - value_consumer_map_[origin].insert(node_index); + value_consumer_map[origin].insert(node_index); } } return Status::OK(); @@ -1138,8 +1139,8 @@ class PlannerImpl { std::cout << p_input_arg->Name() << " reused by " << p_output_arg->Name() << " as input" << std::endl; allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse; allocation_plan[output_idx_global].reused_buffer = reusable_input; - value_consumer_map_[reusable_input].insert(value_consumer_map_[output_idx_global].begin(), - value_consumer_map_[output_idx_global].end()); + value_consumer_map[reusable_input].insert(value_consumer_map[output_idx_global].begin(), + value_consumer_map[output_idx_global].end()); reused.insert(reusable_input); found_reusable = true; break; @@ -1168,8 +1169,8 @@ class PlannerImpl { allocation_plan[reusable_input].alloc_kind == AllocKind::kAllocate) { allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse; allocation_plan[output_idx_global].reused_buffer = reusable_input; - value_consumer_map_[reusable_input].insert(value_consumer_map_[output_idx_global].begin(), - value_consumer_map_[output_idx_global].end()); + value_consumer_map[reusable_input].insert(value_consumer_map[output_idx_global].begin(), + value_consumer_map[output_idx_global].end()); reused.insert(reusable_input); continue; } // if @@ -1187,11 +1188,11 @@ class PlannerImpl { OrtValueIndex input_arg_index{}; if (value_map.GetIdx(p_input_arg->Name(), input_arg_index).IsOK() && allocation_plan[input_arg_index].alloc_kind == AllocKind::kAllocate) { - if (value_consumer_map_[input_arg_index].size() == 1 && SameSize(*p_input_arg, *p_output_arg)) { + if (value_consumer_map[input_arg_index].size() == 1 && SameSize(*p_input_arg, *p_output_arg)) { allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse; allocation_plan[output_idx_global].reused_buffer = input_arg_index; - value_consumer_map_[input_arg_index].insert(value_consumer_map_[output_idx_global].begin(), - value_consumer_map_[output_idx_global].end()); + value_consumer_map[input_arg_index].insert(value_consumer_map[output_idx_global].begin(), + value_consumer_map[output_idx_global].end()); reused.insert(input_arg_index); } } @@ -1266,7 +1267,7 @@ class PlannerImpl { } bool all_covered = true; - for (auto consumer : 
value_consumer_map_[output_idx_global]) { + for (auto consumer : value_consumer_map[output_idx_global]) { if (deps->find(consumer) == deps->end()) { all_covered = false; break; @@ -1277,9 +1278,9 @@ class PlannerImpl { allocation_plan[downstream_value].reused_buffer = output_idx_global; get_reused = true; // add new consumer for the value to be reused - value_consumer_map_[output_idx_global].insert(value_node_map_[downstream_value]); - value_consumer_map_[output_idx_global].insert(value_consumer_map_[downstream_value].begin(), - value_consumer_map_[downstream_value].end()); + value_consumer_map[output_idx_global].insert(value_node_map_[downstream_value]); + value_consumer_map[output_idx_global].insert(value_consumer_map[downstream_value].begin(), + value_consumer_map[downstream_value].end()); node_iter = size_iter->second.erase(node_iter); if (size_iter->second.empty()) { local_iter->second.erase(size_iter); @@ -1342,8 +1343,9 @@ class PlannerImpl { ort_value_usecount.reserve(ort_value_info_.size()); #endif for (size_t i = 0; i < stream_nodes_.size(); ++i) { - // compute use count first + // compute use count first. TODO(leca): call ComputeReuseCount() only once is enough! ORT_RETURN_IF_ERROR(ComputeReuseCount()); + for (int j = 0; static_cast(j) < ort_value_info_.size(); j++) Buffer(j) = j; #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE) if (i == 0) { for (auto ort_value_info : ort_value_info_) { @@ -1693,8 +1695,8 @@ class PlannerImpl { const auto& name = input.Name(); int value_idx; ORT_RETURN_IF_ERROR(ort_value_name_idx_map_.GetIdx(name, value_idx)); - auto origin = Buffer(value_idx); - if (origin != -1 && plan_.allocation_plan[origin].alloc_kind == AllocKind::kAllocate) { + auto origin = AllocPlan(value_idx).reused_buffer; + if (AllocPlan(origin).alloc_kind == AllocKind::kAllocate) { // add current node as consumer for origin buffer value_consumers[origin].push_back(node_index); } @@ -1889,7 +1891,7 @@ class PlannerImpl { // 2. the consumer is in the same stream(non-cpu device), but it consumes a CPU tensor from an non-shape op. // for example, a resize cuda kernel consumer a tensor from MemCpyToHost cuda kernel on the same stream. 
// in this case, the FIFO can't guarantee the cpu tensor is ready when resize kernel is launching - OrtDevice::DeviceType output_arg_device = plan_.allocation_plan[output_arg_idx].location.Type(); + OrtDevice::DeviceType output_arg_device = AllocPlan(output_arg_idx).location.Type(); WaitNotificationFn wait_handle = stream_handle_registry.GetWaitHandle(stream_device, output_arg_device); if ((node_stream_map_[it->Index()] != i || output_arg_device == OrtDevice::CPU) && wait_handle != nullptr) { if (node_to_notification.find(node_index) == node_to_notification.end()) { diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc index d7b1de5c930c5..3e0d94e94e48c 100644 --- a/onnxruntime/test/framework/allocation_planner_test.cc +++ b/onnxruntime/test/framework/allocation_planner_test.cc @@ -1974,6 +1974,74 @@ TEST_F(PlannerTest, TestCpuIf) { ASSERT_TRUE(exe_plan[1]->steps_[6]->ToString().substr(0, WaitOnEPStep.size()) == WaitOnEPStep); } } + +// model looks like: +// |-----------> Gather +// |-----------> Gather +// |-----------> Gather +// |-----------> Gather +// Shape ----------------> Reshape --> Shape ------------------> Reshape +// ^ ^ +// InstanceNormalization ----| InstanceNormalization ------| +// +// Python script to create this model: +// def CreateModelFor19480(): +// #shape->reshape->shape->reshape, 4 gather +// graphNodes = [] +// graphNodes.append(h.make_node('Shape', inputs=['shape_input'], outputs=['9'])) +// graphNodes.append(h.make_node('InstanceNormalization', inputs=['in0_input', 'scale0', 'B0'], outputs=['8'])) +// graphNodes.append(h.make_node('Reshape', inputs=['8', '9'], outputs=['Reshape15_output'])) +// graphNodes.append(h.make_node('Shape', inputs=['Reshape15_output'], outputs=['281'])) +// graphNodes.append(h.make_node('InstanceNormalization', inputs=['in1_input', 'scale1', 'B1'], outputs=['293'])) +// graphNodes.append(h.make_node('Reshape', inputs=['293', '281'], outputs=['output0'])) +// graphNodes.append(h.make_node('Gather', inputs=['281', 'indices1'], outputs=['output1'])) +// graphNodes.append(h.make_node('Gather', inputs=['281', 'indices2'], outputs=['output2'])) +// graphNodes.append(h.make_node('Gather', inputs=['281', 'indices3'], outputs=['output3'])) +// graphNodes.append(h.make_node('Gather', inputs=['281', 'indices4'], outputs=['output4'])) +// g = h.make_graph(graphNodes, 'issue_19480', +// [h.make_tensor_value_info('shape_input', tp.FLOAT, ['batch', 128, None, None]), +// h.make_tensor_value_info('in0_input', tp.FLOAT, ['batch', 32, None]), +// h.make_tensor_value_info('scale0', tp.FLOAT, [32]), +// h.make_tensor_value_info('B0', tp.FLOAT, [32]), +// h.make_tensor_value_info('in1_input', tp.FLOAT, ['batch', 32, None]), +// h.make_tensor_value_info('scale1', tp.FLOAT, [32]), +// h.make_tensor_value_info('B1', tp.FLOAT, [32]), +// h.make_tensor_value_info('indices1', tp.INT32, []), +// h.make_tensor_value_info('indices2', tp.INT32, []), +// h.make_tensor_value_info('indices3', tp.INT32, []), +// h.make_tensor_value_info('indices4', tp.INT32, [])], +// [h.make_tensor_value_info('output0', tp.FLOAT, None), +// h.make_tensor_value_info('output1', tp.INT64, None), +// h.make_tensor_value_info('output2', tp.INT64, None), +// h.make_tensor_value_info('output3', tp.INT64, None), +// h.make_tensor_value_info('output4', tp.INT64, None)]) +// model = h.make_model(g, opset_imports=[h.make_operatorsetid("", 17)], producer_name='producer_name') +// onnx.save(model, 'issue_19480.onnx') +// 
+TEST(AllocationPlannerTest, ReusedInputCrossDifferentStreams) { + SessionOptions sess_opt; + sess_opt.graph_optimization_level = TransformerLevel::Default; + + InferenceSession sess(sess_opt, GetEnvironment(), ORT_TSTR("./testdata/multi_stream_models/issue_19480.onnx")); + auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); + status = sess.Load(); + status = sess.Initialize(); + ASSERT_TRUE(status.IsOK()) << "No crash"; + const SequentialExecutionPlan* plan = sess.GetSessionState().GetExecutionPlan(); + ASSERT_EQ(plan->allocation_plan[14].alloc_kind, AllocKind::kReuse) << "The input of reshape and gather will reuse the output of shape"; + + int gather_count = 0; + for (size_t i = 0; i < plan->execution_plan[1]->steps_.size(); i++) { + if (strstr(typeid(*(plan->execution_plan[1]->steps_[i])).name(), "LaunchKernelStep")) { + const Node* node = sess.GetSessionState().GetGraphViewer().GetNode(plan->execution_plan[1]->steps_[i]->GetNodeIndex()); + if (node->OpType() == "Gather") + gather_count++; + else + FAIL() << "CPU stream should contain only gather ops"; + } + } + ASSERT_EQ(gather_count, 4) << "4 gather ops are all placed in CPU stream"; +} #endif } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/testdata/multi_stream_models/issue_19480.onnx b/onnxruntime/test/testdata/multi_stream_models/issue_19480.onnx new file mode 100644 index 0000000000000000000000000000000000000000..dc7d39206dd49f4ef6daf65b7d58c5b456ecf331 GIT binary patch literal 760 zcmaixKTm@|7>Bw3f%9#m_0_6_F_p#1gab^#v5RqW(2b?J(o1>?g{IKO$uH`6@t{2^ z#m0q%=Y9D7j(aJ^(>&%f;j=_M&c!l&{_evy4DtnEiK$Fin*vE__dm*aU~nQ+XN$p9 zA11aA3GP#64zs z+VGAUzBYVq;6Ud2Mod}g2Tt_Ry!IQoq685v?9X@+FQ7}m2pC{Q_TCzB1Q$v>tF;at zE9X-02LY%OdZ2hTthTjJs;u4R{+GpCSxtg_7ivO}nrK8dbFt05KbWuCO#ReubJg)l W4Oj)N8n}nRI|Tj~OnP7p&wl_GGP8{U literal 0 HcmV?d00001 From 3bdb10d5ca4f258ec444863bcd5e839eeac5c238 Mon Sep 17 00:00:00 2001 From: jingyanwangms <47403504+jingyanwangms@users.noreply.github.com> Date: Thu, 22 Feb 2024 10:56:25 -0800 Subject: [PATCH 040/279] Move import to when needed to avoid circular dependency error (#19579) ### Description Move import to when needed to avoid circular dependency error ### Motivation and Context Fixes dependency error described here: https://github.com/microsoft/DeepSpeed/issues/5140 --------- Co-authored-by: Thiago Crepaldi --- .../python/training/ortmodule/_graph_execution_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py index 779b6bfe50422..fda6e345da235 100755 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py @@ -20,7 +20,6 @@ from onnxruntime.capi import _pybind_state as C from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference from onnxruntime.training.utils import ORTModelInputOutputSchemaType, PTable, onnx_dtype_to_pytorch_dtype -from onnxruntime.training.utils.hooks import configure_ort_compatible_zero_stage3 from . 
import _are_deterministic_algorithms_enabled, _io, _logger, _onnx_models, _utils from ._fallback import ( @@ -143,6 +142,9 @@ def __init__( self._zero_stage3_param_map = {} if self._runtime_options.enable_zero_stage3_support: + # Move import to here to avoid circular dependency error + from onnxruntime.training.utils.hooks import configure_ort_compatible_zero_stage3 # type: ignore[import] + # Cannot toggle feature enabling/disabling after the first time enabled. configure_ort_compatible_zero_stage3(debug=False, stats_output_dir="ort_output", stats_overwrite=True) From fe82fccf1a4d7ea6c24c8448d7264df36605c370 Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Fri, 23 Feb 2024 05:09:28 +0800 Subject: [PATCH 041/279] [js/webgpu] Fix Conv2DTransposeMatMul f16 compilation failure (#19596) This is used in sam-h-decoder-f16. ### Description ### Motivation and Context --- .../ops/3rd-party/conv_backprop_mm_webgpu.ts | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_mm_webgpu.ts index b5b6a2a15cd8c..11c8778b72335 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_mm_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_mm_webgpu.ts @@ -23,17 +23,17 @@ import {DataType} from '../../../../wasm-common'; import {LOG_DEBUG} from '../../../log'; import {TensorView} from '../../../tensor-view'; import {ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../../types'; -import {createTensorShapeVariables, inputVariable, outputVariable, ShaderHelper, UniformsArrayType} from '../common'; +import {createTensorShapeVariables, inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType, UniformsArrayType} from '../common'; import {ConvTransposeAttributes} from '../conv-transpose'; import {appendActivationUniforms, appendActivationUniformsData, getActivationSnippet} from '../fuse-utils'; -import {biasSnippet, typeSnippet} from './activation_util'; +import {biasSnippet} from './activation_util'; import {utilFunctions} from './conv_util'; import {makeMatMulPackedSource, makeMatMulPackedVec4Source} from './matmul_packed_webgpu'; const conv2dTransposeCommonSnippet = - (isChannelsLast: boolean, addBias = false, attributes: ConvTransposeAttributes, innerElementSize = 4): string => { - const type = typeSnippet(innerElementSize, 'f32'); + (isChannelsLast: boolean, addBias = false, attributes: ConvTransposeAttributes, type: string, + innerElementSize = 4): string => { const getWSnippet = (innerElementSize: number) => { switch (innerElementSize) { case 1: @@ -47,7 +47,7 @@ const conv2dTransposeCommonSnippet = let v1 = w[getIndexFromCoords4D(coord1, vec4(uniforms.w_shape))]; let v2 = w[getIndexFromCoords4D(coord2, vec4(uniforms.w_shape))]; let v3 = w[getIndexFromCoords4D(coord3, vec4(uniforms.w_shape))]; - return vec4(v0, v1, v2, v3); + return ${type}(v0, v1, v2, v3); `; default: throw new Error(`innerElementSize ${innerElementSize} is not supported.`); @@ -224,7 +224,7 @@ export const createConv2DTransposeMatMulProgramInfo = const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims.length, components); inputVariables.push(bias); declareFunctions += ` - fn getBiasByOutputCoords(coords : vec4) -> ${isVec4 ? 'vec4' : 'f32'} { + fn getBiasByOutputCoords(coords : vec4) -> ${bias.type.value} { return bias[coords.${isChannelsLast ? 'w' : 'y'}${isVec4 ? 
'/ 4' : ''}]; }`; } @@ -236,16 +236,20 @@ export const createConv2DTransposeMatMulProgramInfo = {name: 'pads', type: 'i32', length: pads.length} ]; appendActivationUniforms(attributes, uniforms); + const elemType = tensorTypeToWsglStorageType(inputs[0].dataType, 1); + if (elemType !== 'f16' && elemType !== 'f32') { + throw new Error(`elemType ${elemType} is not supported.`); + } return ` ${utilFunctions('uniforms.result_strides')} ${shaderHelper.registerUniforms(uniforms).declareVariables(...inputVariables, output)}; ${declareFunctions} - ${conv2dTransposeCommonSnippet(isChannelsLast, hasBias, attributes, innerElementSize)} + ${conv2dTransposeCommonSnippet(isChannelsLast, hasBias, attributes, x.type.value, innerElementSize)} ${ isVec4 ? makeMatMulPackedVec4Source( - elementsPerThread, workGroupSize, 'f32', undefined, !isChannelsLast, tileInner) : + elementsPerThread, workGroupSize, elemType, undefined, !isChannelsLast, tileInner) : makeMatMulPackedSource( - elementsPerThread, workGroupSize, 'f32', undefined, !isChannelsLast, tileInner, false, + elementsPerThread, workGroupSize, elemType, undefined, !isChannelsLast, tileInner, false, undefined, sequentialAccessByThreads)}`; }; From 09622418c45b265977a8f1f17581e15719357423 Mon Sep 17 00:00:00 2001 From: Hector Li Date: Thu, 22 Feb 2024 13:15:13 -0800 Subject: [PATCH 042/279] Add special handling if there is only 1 graph inside the cached QNN context binary (#19594) Add special handling if there is only 1 graph inside the cached QNN context binary. No need to make the EPContext node name match the QNN graph name. This is for better backward compatibility in case the QNN context model is generated before the PR for QNN context binary model support multi-partition. --- .../qnn/builder/onnx_ctx_model_helper.cc | 6 +- .../qnn/builder/onnx_ctx_model_helper.h | 3 +- .../qnn/builder/qnn_backend_manager.cc | 15 ++-- .../providers/qnn/qnn_execution_provider.cc | 3 +- .../test/providers/qnn/qnn_ep_context_test.cc | 83 ++++++++++++++++++- 5 files changed, 99 insertions(+), 11 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc index c2e71081b898e..2d8ec295d613b 100644 --- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc @@ -151,12 +151,14 @@ Status GetEpContextFromMainNode(const onnxruntime::Node& main_context_node, Status LoadQnnCtxFromOnnxGraph(const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::PathString& ctx_onnx_model_path, QnnBackendManager* qnn_backend_manager, - std::unordered_map>& qnn_models) { + std::unordered_map>& qnn_models, + const logging::Logger& logger) { Status status = GetEpContextFromMainNode(*graph_viewer.Nodes().begin(), ctx_onnx_model_path, qnn_backend_manager, qnn_models); // This is the protocol with customer that status with INVALID_GRAPH will be generated if failed to load context model if (!status.IsOK()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_GRAPH, "Failed to load from EpContextModel. ", status.ErrorMessage()); + LOGS(logger, ERROR) << "Failed to load from EpContext model. " << status.ErrorMessage(); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_GRAPH, "Failed to load from EpContext model. 
", status.ErrorMessage()); } return Status::OK(); diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h index b1360b4e576fa..7d56b45a1dbcd 100644 --- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h @@ -56,7 +56,8 @@ Status GetEpContextFromMainNode(const onnxruntime::Node& main_context_node, Status LoadQnnCtxFromOnnxGraph(const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::PathString& ctx_onnx_model_path, QnnBackendManager* qnn_backend_manager, - std::unordered_map>& qnn_models); + std::unordered_map>& qnn_models, + const logging::Logger& logger); Status CreateEPContextNodes(Model* model, unsigned char* buffer, diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index 5f0b87c7cb9d7..ca34a1efa6ca7 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -573,11 +573,16 @@ Status QnnBackendManager::LoadCachedQnnContextFromBuffer(char* buffer, uint64_t // More work to support multiple partition, how to map the graph name in compile to qnn graph name // Need the lower level framework to understand EPContext op and pass in the partition_name in fused_node during Compile - for (uint32_t i = 0; i < graph_count; ++i) { - std::string graph_name(graphs_info[i].graphInfoV1.graphName); - auto qnn_model_pos = qnn_models.find(graph_name); - ORT_RETURN_IF(qnn_model_pos == qnn_models.end(), graph_name + " does not match any EPContext node names."); - ORT_RETURN_IF_ERROR(qnn_model_pos->second->DeserializeGraphInfoFromBinaryInfo(graphs_info[i])); + if (1 == graph_count) { + auto qnn_model_pose = qnn_models.begin(); + ORT_RETURN_IF_ERROR(qnn_model_pose->second->DeserializeGraphInfoFromBinaryInfo(graphs_info[0])); + } else { + for (uint32_t i = 0; i < graph_count; ++i) { + std::string graph_name(graphs_info[i].graphInfoV1.graphName); + auto qnn_model_pos = qnn_models.find(graph_name); + ORT_RETURN_IF(qnn_model_pos == qnn_models.end(), graph_name + " does not match any EPContext node names."); + ORT_RETURN_IF_ERROR(qnn_model_pos->second->DeserializeGraphInfoFromBinaryInfo(graphs_info[i])); + } } qnn_sys_interface_.systemContextFree(sys_ctx_handle); diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index f5a166d36b15a..9a6540a3efea5 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -670,7 +670,8 @@ Status QNNExecutionProvider::Compile(const std::vector& fused ORT_RETURN_IF_ERROR(qnn::LoadQnnCtxFromOnnxGraph(main_ctx_graph_viewer, context_cache_path, qnn_backend_manager_.get(), - qnn_models)); + qnn_models, + logger)); for (auto fused_node_and_graph : fused_nodes_and_graphs) { const onnxruntime::GraphViewer& graph_viewer(fused_node_and_graph.filtered_graph); diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc index b1f3b52e77553..eaef6f6315157 100644 --- a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc @@ -463,7 +463,6 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCache_InvalidGraph) { InferenceSessionWrapper session_object{so, GetEnvironment()}; - std::string 
provider_type = kCpuExecutionProvider; ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options))); ASSERT_STATUS_OK(session_object.Load(qnn_ctx_model_data.data(), static_cast(qnn_ctx_model_data.size()))); // Verify the return status with code INVALID_GRAPH @@ -486,7 +485,6 @@ std::string CreateQnnCtxModelWithNonEmbedMode(std::string external_bin_path) { auto* graph_output = helper.MakeOutput(shape); Node& ep_context_node = helper.AddNode("EPContext", {graph_input}, {graph_output}, kMSDomain); ep_context_node.AddAttribute("embed_mode", static_cast(0)); - // The .. in the path will cause INVALID_GRAPH ep_context_node.AddAttribute("ep_cache_context", external_bin_path); ep_context_node.AddAttribute("partition_name", "QNNExecutionProvider_QNN_1110111000111000111_1_0"); ep_context_node.AddAttribute("source", "QNN"); @@ -651,6 +649,87 @@ TEST_F(QnnHTPBackendTests, QnnContextBinary2InputsTest) { ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); } +// Context binary only contains a single QNN graph, generated context cache model (detached mode) only has 1 EPContext node +// Create another Onnx model which also reference to the bin file, +// but the node name is not same with the QNN graph name inside the bin file. +// This is to support backward compitable for the models generated before the PR that +// make context generation support multi-partition +TEST_F(QnnHTPBackendTests, QnnContextBinaryCache_SingleNodeNameNotMatchGraphNameInCtx) { + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + const std::string context_binary_file = "./qnn_context_cache_non_embed.onnx"; + std::filesystem::path context_bin = "qnn_context_cache_non_embed.onnx_QNNExecutionProvider_QNN_8283143575221199085_1_0.bin"; + std::remove(context_binary_file.c_str()); + std::remove(context_bin.string().c_str()); + + std::unordered_map session_option_pairs; + session_option_pairs.emplace(kOrtSessionOptionEpContextEnable, "1"); + session_option_pairs.emplace(kOrtSessionOptionEpContextFilePath, context_binary_file); + session_option_pairs.emplace(kOrtSessionOptionEpContextEmbedMode, "0"); + + const TestInputDef input_def({1, 2, 3}, false, -10.0f, 10.0f); + const std::string op_type = "Atan"; + + // Runs model with DQ-> Atan-> Q and compares the outputs of the CPU and QNN EPs. 
+ // 1st run will generate the Onnx skeleton file + Qnn context cache binary file + TestQDQModelAccuracy(BuildOpTestCase(op_type, {input_def}, {}, {}), + BuildQDQOpTestCase(op_type, {input_def}, {}, {}), + provider_options, + 14, + ExpectedEPNodeAssignment::All, + QDQTolerance(), + logging::Severity::kERROR, + "", // context model file path, not required for this inference + session_option_pairs); + + // Check the Onnx skeleton file is generated + EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str())); + // Check the Qnn context cache binary file is generated + EXPECT_TRUE(std::filesystem::exists(context_bin)); + + const std::unordered_map domain_to_version = {{"", 11}, {kMSDomain, 1}}; + auto& logging_manager = DefaultLoggingManager(); + onnxruntime::Model model("QNN_ctx_model", false, ModelMetaData(), PathString(), + IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {}, + logging_manager.DefaultLogger()); + Graph& graph = model.MainGraph(); + ModelTestBuilder helper(graph); + std::vector shape = {1, 2, 3}; + NodeArg* graph_input = MakeTestInput(helper, TestInputDef(shape, false, {0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f})); + auto* graph_output = helper.MakeOutput(shape); + Node& ep_context_node = helper.AddNode("EPContext", {graph_input}, {graph_output}, kMSDomain); + ep_context_node.AddAttribute("embed_mode", static_cast(0)); + ep_context_node.AddAttribute("ep_cache_context", context_bin.string()); + ep_context_node.AddAttribute("partition_name", "QNNExecutionProvider_QNN_1110111000111000111_1_0"); + ep_context_node.AddAttribute("source", "QNNExecutionProvider"); + helper.SetGraphOutputs(); + ASSERT_STATUS_OK(graph.Resolve()); + std::string model_data; + model.ToProto().SerializeToString(&model_data); + + // loads and run from Onnx skeleton file + Qnn context cache binary file + + SessionOptions so; + so.session_logid = "qnn_ctx_model_logger"; + RunOptions run_options; + run_options.run_tag = so.session_logid; + + InferenceSessionWrapper session_object{so, GetEnvironment()}; + + ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options))); + ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast(model_data.size()))); + // Verify the return status with code INVALID_GRAPH + ASSERT_TRUE(session_object.Initialize().Code() == common::StatusCode::OK); + + // Clean up + ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); + ASSERT_EQ(std::remove(context_bin.string().c_str()), 0); +} + #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) } // namespace test From 76a2a487a12c7ec579f453a36932429164494ef6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 22 Feb 2024 13:58:17 -0800 Subject: [PATCH 043/279] Bump ip from 1.1.8 to 1.1.9 in /js/react_native/e2e (#19583) Bumps [ip](https://github.com/indutny/node-ip) from 1.1.8 to 1.1.9.
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=ip&package-manager=npm_and_yarn&previous-version=1.1.8&new-version=1.1.9)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) Dependabot will merge this PR once CI passes on it, as requested by @fs-eire. [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/microsoft/onnxruntime/network/alerts).
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 js/react_native/e2e/yarn.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/js/react_native/e2e/yarn.lock b/js/react_native/e2e/yarn.lock
index 9e20a286c4e27..6f05faf046098 100644
--- a/js/react_native/e2e/yarn.lock
+++ b/js/react_native/e2e/yarn.lock
@@ -3351,9 +3351,9 @@ invariant@^2.2.4:
     loose-envify "^1.0.0"
 
 ip@^1.1.5:
-  version "1.1.8"
-  resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.8.tgz#ae05948f6b075435ed3307acce04629da8cdbf48"
-  integrity sha512-PuExPYUiu6qMBQb4l06ecm6T6ujzhmh+MeJcW9wa89PoAz5pvd4zPgN5WJV104mb6S2T1AwNIAaB70JNrLQWhg==
+  version "1.1.9"
+  resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.9.tgz#8dfbcc99a754d07f425310b86a99546b1151e396"
+  integrity sha512-cyRxvOEpNHNtchU3Ln9KC/auJgup87llfQpQ+t5ghoC/UhL16SWzbueiCsdTnWmqAWl7LadfuwhlqmtOaqMHdQ==
 
 is-accessor-descriptor@^0.1.6:
   version "0.1.6"

From 5e5c36f6df95dfbb25787ea385f733f8c9ef691e Mon Sep 17 00:00:00 2001
From: AtomicVar
Date: Fri, 23 Feb 2024 09:03:56 +0800
Subject: [PATCH 044/279] Fix citation author name issue (#19597)

Use `name` rather than `given-names` to set the author name.

### Motivation and Context
The old CITATION.cff uses `given-names` to set author names, which won't be rendered properly with some BibTeX styles in LaTeX: [screenshot omitted]
The problem is that **`"ONNX Runtime developers"` is regarded as a human name**.

How to fix: by using `name` to set the author name, the generated BibTeX entry will use `{}` to enclose `"ONNX Runtime developers"`. Then it is displayed literally: [screenshot omitted]
---
 CITATION.cff | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/CITATION.cff b/CITATION.cff
index 82bcac5a7b750..10b7290022aef 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -3,8 +3,7 @@ title: ONNX Runtime
 message: "Please use this information to cite ONNX Runtime in
   research or other publications."
 authors:
-  - affiliation: Microsoft Corporation
-    given-names: ONNX Runtime developers
+  - name: ONNX Runtime developers
 date-released: 2018-11-29
 url: "https://onnxruntime.ai"
 repository-code: "https://github.com/microsoft/onnxruntime"

From 4ab497603e915ca992b96ef1ec25bfcf8b9a2ad5 Mon Sep 17 00:00:00 2001
From: Hector Li
Date: Thu, 22 Feb 2024 17:04:59 -0800
Subject: [PATCH 045/279] Enable user to set QNN HTP performance mode for every session run (#19521)

### Description
Currently, the QNN HTP performance mode is set during session creation and there is no way to change it afterwards. There is a requirement to set a high performance mode for high-priority requests and to switch back to a low performance mode later to save power when incoming requests are idle. The performance mode is still kept at the session level in the QNN EP options and is used as the default one; the ORT QNN EP will set it once if the user sets it. In addition, there are run option settings (qnn.htp_perf_mode and qnn.htp_perf_mode_post_run) to change the performance mode before and after a session run. The recommended scenario is for the user to set a high performance mode before the inference run so the result comes back as soon as possible, and to set a low performance mode after the inference to save power.
--- .../core/framework/execution_provider.h | 10 +- .../onnxruntime_run_options_config_keys.h | 12 + .../framework/stream_execution_context.cc | 4 +- .../providers/cann/cann_execution_provider.cc | 2 +- .../providers/cann/cann_execution_provider.h | 2 +- .../providers/cuda/cuda_execution_provider.cc | 4 +- .../providers/cuda/cuda_execution_provider.h | 5 +- .../src/ExecutionProvider.h | 4 +- .../providers/js/js_execution_provider.cc | 4 +- .../core/providers/js/js_execution_provider.h | 4 +- .../migraphx/migraphx_execution_provider.cc | 4 +- .../migraphx/migraphx_execution_provider.h | 4 +- .../qnn/builder/qnn_backend_manager.cc | 75 +++--- .../qnn/builder/qnn_backend_manager.h | 19 +- .../providers/qnn/qnn_execution_provider.cc | 198 +++++++++++++++- .../providers/qnn/qnn_execution_provider.h | 73 +++++- .../providers/rocm/rocm_execution_provider.cc | 4 +- .../providers/rocm/rocm_execution_provider.h | 4 +- .../tensorrt/tensorrt_execution_provider.cc | 4 +- .../tensorrt/tensorrt_execution_provider.h | 4 +- onnxruntime/core/session/inference_session.cc | 12 +- .../cuda_execution_provider_test.cc | 13 +- .../test/providers/qnn/qnn_basic_test.cc | 217 ++++++++++++++++-- 23 files changed, 577 insertions(+), 105 deletions(-) diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h index 31c988f500779..c1cc69edc17d8 100644 --- a/include/onnxruntime/core/framework/execution_provider.h +++ b/include/onnxruntime/core/framework/execution_provider.h @@ -33,6 +33,8 @@ class Node; #include "core/framework/stream_handles.h" #include "core/framework/tuning_context.h" +struct OrtRunOptions; + namespace onnxruntime { /** @@ -51,6 +53,8 @@ struct NodeComputeInfo { DestroyFunctionStateFunc release_state_func; }; +using RunOptions = OrtRunOptions; + enum class DataLayout { NCHW, NHWC, @@ -184,7 +188,7 @@ class IExecutionProvider { Run may not be finished on device This function should be regarded as the point after which a new Run would start to submit commands from CPU */ - virtual common::Status OnRunStart() { return Status::OK(); } + virtual common::Status OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { return Status::OK(); } /** Called when InferenceSession::Run ended @@ -192,7 +196,9 @@ class IExecutionProvider { may not be finished on device This function should be regarded as the point that all commands of current Run has been submmited by CPU */ - virtual common::Status OnRunEnd(bool /*sync_stream*/) { return Status::OK(); } + virtual common::Status OnRunEnd(bool /*sync_stream*/, const onnxruntime::RunOptions& /*run_options*/) { + return Status::OK(); + } /** Indicate whether the graph capturing mode (e.g., cuda graph) is enabled for diff --git a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h index 1f5fcd50e185c..b0a17e175fef3 100644 --- a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h @@ -30,3 +30,15 @@ static const char* const kOrtRunOptionsConfigEnableMemoryArenaShrinkage = "memor // Per default it will be set to '0' // Taking CUDA EP as an example, it omit triggering cudaStreamSynchronize on the compute stream. static const char* const kOrtRunOptionsConfigDisableSynchronizeExecutionProviders = "disable_synchronize_execution_providers"; + +// Set HTP performance mode for QNN HTP backend before session run. 
+// options for HTP performance mode: "burst", "balanced", "default", "high_performance", +// "high_power_saver", "low_balanced", "extreme_power_saver", "low_power_saver", "power_saver", +// "sustained_high_performance". Default to "default". +static const char* const kOrtRunOptionsConfigQnnPerfMode = "qnn.htp_perf_mode"; + +// Set HTP performance mode for QNN HTP backend post session run. +static const char* const kOrtRunOptionsConfigQnnPerfModePostRun = "qnn.htp_perf_mode_post_run"; + +// Set RPC control latency for QNN HTP backend +static const char* const kOrtRunOptionsConfigQnnRpcControlLatency = "qnn.rpc_control_latency"; diff --git a/onnxruntime/core/framework/stream_execution_context.cc b/onnxruntime/core/framework/stream_execution_context.cc index 875e7f395bfa8..dd7f4d35b34bd 100644 --- a/onnxruntime/core/framework/stream_execution_context.cc +++ b/onnxruntime/core/framework/stream_execution_context.cc @@ -181,11 +181,13 @@ void RunSince(size_t stream_idx, StreamExecutionContext& ctx, SessionScope& sess } #ifdef USE_CANN + // Leave it to CANN EP to fill the gap if they want to use run_options + static onnxruntime::RunOptions run_options; // For CANN EP, it is necessary to explicitly create a corresponding Context for each thread in the thread pool, // which is different from CUDA Runtime API, but similar to CUDA Driver API. auto& execution_providers = ctx.GetSessionState().GetExecutionProviders(); for (auto& xp : execution_providers) { - auto status = xp->OnRunStart(); + auto status = xp->OnRunStart(run_options); if (!status.IsOK()) { ctx.SetStatus(status); return; diff --git a/onnxruntime/core/providers/cann/cann_execution_provider.cc b/onnxruntime/core/providers/cann/cann_execution_provider.cc index 752b742805a7c..9a242919665bb 100644 --- a/onnxruntime/core/providers/cann/cann_execution_provider.cc +++ b/onnxruntime/core/providers/cann/cann_execution_provider.cc @@ -1045,7 +1045,7 @@ CANNExecutionProvider::~CANNExecutionProvider() { } // All threads share the same context and stream -Status CANNExecutionProvider::OnRunStart() { +Status CANNExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { CANN_RETURN_IF_ERROR(aclrtSetDevice(info_.device_id)); return Status::OK(); diff --git a/onnxruntime/core/providers/cann/cann_execution_provider.h b/onnxruntime/core/providers/cann/cann_execution_provider.h index 63ae980869c65..d83bd88d6958f 100644 --- a/onnxruntime/core/providers/cann/cann_execution_provider.h +++ b/onnxruntime/core/providers/cann/cann_execution_provider.h @@ -33,7 +33,7 @@ class CANNExecutionProvider : public IExecutionProvider { explicit CANNExecutionProvider(const CANNExecutionProviderInfo& info); virtual ~CANNExecutionProvider(); - Status OnRunStart() override; + Status OnRunStart(const onnxruntime::RunOptions& run_options) override; template Status Fill(Tensor* y, void* addr, aclrtStream stream) const { diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 48a952e6dd98f..0dd568c5ecc05 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -386,7 +386,7 @@ Status CUDAExecutionProvider::Sync() const { return Status::OK(); } -Status CUDAExecutionProvider::OnRunStart() { +Status CUDAExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { // always set CUDA device when session::Run() in case it runs in a worker thread 
CUDA_RETURN_IF_ERROR(cudaSetDevice(GetDeviceId())); if (IsGraphCaptureEnabled() && GetPerThreadContext().IsGraphCaptureAllowed() && !GetPerThreadContext().IsGraphCaptured()) { @@ -396,7 +396,7 @@ Status CUDAExecutionProvider::OnRunStart() { return Status::OK(); } -Status CUDAExecutionProvider::OnRunEnd(bool sync_stream) { +Status CUDAExecutionProvider::OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& /*run_options*/) { if (IsGraphCaptureEnabled() && !GetPerThreadContext().IsGraphCaptured()) { if (GetPerThreadContext().IsGraphCaptureAllowed()) { GetPerThreadContext().CaptureEnd(); diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h index 55f0b5570e0ee..5f62f313b86a2 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h @@ -29,9 +29,9 @@ class CUDAExecutionProvider : public IExecutionProvider { Status Sync() const override; - Status OnRunStart() override; + Status OnRunStart(const onnxruntime::RunOptions& run_options) override; - Status OnRunEnd(bool sync_stream) override; + Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override; DataLayout GetPreferredLayout() const override; @@ -115,6 +115,7 @@ class CUDAExecutionProvider : public IExecutionProvider { PerThreadContext(OrtDevice::DeviceId device_id, cudaStream_t stream, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy, CUDAExecutionProviderExternalAllocatorInfo external_alloc_info, OrtArenaCfg* arena_cfg); ~PerThreadContext(); + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PerThreadContext); cublasHandle_t CublasHandle() const { return cublas_handle_; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index 5617bc7bdcac6..841d6244a983e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -270,7 +270,7 @@ namespace Dml return m_impl->OnSessionInitializationEnd(); } - virtual onnxruntime::Status Sync() const final override + onnxruntime::Status Sync() const final override { // Completely wait until the device has completed all preceding tasks. // The application could have called SynchronizeBoundOutputs(). @@ -278,7 +278,7 @@ namespace Dml return Status::OK(); } - virtual onnxruntime::Status OnRunEnd(bool /*sync_stream*/) final override + onnxruntime::Status OnRunEnd(bool /*sync_stream*/, const onnxruntime::RunOptions& /*run_options*/) final override { // Flush any pending work to the GPU, but don't block for completion, permitting it // to overlap other work. 
diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 799d4172f2b64..62c3981682cfc 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -756,7 +756,7 @@ std::unique_ptr JsExecutionProvider::GetDataTransfer JsExecutionProvider::~JsExecutionProvider() { } -Status JsExecutionProvider::OnRunStart() { +Status JsExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { if (IsGraphCaptureEnabled() && IsGraphCaptureAllowed() && !IsGraphCaptured()) { LOGS(*GetLogger(), INFO) << "Capturing the webgpu graph for this model"; EM_ASM({ Module.jsepCaptureBegin(); }); @@ -764,7 +764,7 @@ Status JsExecutionProvider::OnRunStart() { return Status::OK(); } -Status JsExecutionProvider::OnRunEnd(bool sync_stream) { +Status JsExecutionProvider::OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& /*run_options*/) { if (IsGraphCaptureEnabled() && !IsGraphCaptured()) { if (IsGraphCaptureAllowed()) { EM_ASM({ Module.jsepCaptureEnd(); }); diff --git a/onnxruntime/core/providers/js/js_execution_provider.h b/onnxruntime/core/providers/js/js_execution_provider.h index 91a3256ec2bd5..b4518c67d1e60 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.h +++ b/onnxruntime/core/providers/js/js_execution_provider.h @@ -59,8 +59,8 @@ class JsExecutionProvider : public IExecutionProvider { std::vector CreatePreferredAllocators() override; - Status OnRunStart() override; - Status OnRunEnd(bool sync_stream) override; + Status OnRunStart(const onnxruntime::RunOptions& run_options) override; + Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override; bool IsGraphCaptureEnabled() const override; bool IsGraphCaptured() const override; diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index 40e76a0a67782..50782569ee80a 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -1383,11 +1383,11 @@ Status MIGraphXExecutionProvider::Sync() const { return Status::OK(); } -Status MIGraphXExecutionProvider::OnRunStart() { +Status MIGraphXExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { return Status::OK(); } -Status MIGraphXExecutionProvider::OnRunEnd(bool) { +Status MIGraphXExecutionProvider::OnRunEnd(bool /*sync_stream*/, const onnxruntime::RunOptions& /*run_options*/) { auto status = hipStreamQuery(stream_); if (status != hipSuccess) { diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h index d582338c7e067..c3617f409e72c 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h @@ -56,9 +56,9 @@ class MIGraphXExecutionProvider : public IExecutionProvider { #ifdef MIGRAPHX_STREAM_SYNC Status Sync() const override; - Status OnRunStart() override; + Status OnRunStart(const onnxruntime::RunOptions& run_options) override; - Status OnRunEnd(bool sync_stream) override; + Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override; #endif std::vector> diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc 
index ca34a1efa6ca7..e354bf6562722 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -634,11 +634,6 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger, bool load_ LOGS(logger, VERBOSE) << "CreateContext succeed."; } - if (htp_performance_mode_ != HtpPerformanceMode::kHtpDefault) { - ORT_RETURN_IF_ERROR(SetHtpPowerConfig()); - LOGS(logger, VERBOSE) << "SetHtpPowerConfig succeed."; - } - LOGS(logger, VERBOSE) << "QNN SetupBackend succeed"; backend_setup_completed_ = true; @@ -646,7 +641,7 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger, bool load_ return Status::OK(); } -Status QnnBackendManager::SetHtpPowerConfig() { +Status QnnBackendManager::CreateHtpPowerCfgId(uint32_t device_id, uint32_t core_id, uint32_t& htp_power_config_id) { QnnDevice_Infrastructure_t qnn_device_infra = nullptr; auto status = qnn_interface_.deviceGetInfrastructure(&qnn_device_infra); ORT_RETURN_IF(QNN_SUCCESS != status, "backendGetPerfInfrastructure failed."); @@ -656,23 +651,37 @@ Status QnnBackendManager::SetHtpPowerConfig() { "HTP infra type = ", htp_infra->infraType, ", which is not perf infra type."); QnnHtpDevice_PerfInfrastructure_t& htp_perf_infra = htp_infra->perfInfra; // Get power client id - status = htp_perf_infra.createPowerConfigId(/*device_id=*/0, /*core_id=*/0, &htp_power_config_client_id_); + status = htp_perf_infra.createPowerConfigId(device_id, core_id, &htp_power_config_id); ORT_RETURN_IF(QNN_SUCCESS != status, "createPowerConfigId failed."); + return Status::OK(); +} + +Status QnnBackendManager::SetHtpPowerConfig(uint32_t htp_power_config_client_id, + HtpPerformanceMode htp_performance_mode) { + QnnDevice_Infrastructure_t qnn_device_infra = nullptr; + auto status = qnn_interface_.deviceGetInfrastructure(&qnn_device_infra); + ORT_RETURN_IF(QNN_SUCCESS != status, "backendGetPerfInfrastructure failed."); + + auto* htp_infra = static_cast(qnn_device_infra); + ORT_RETURN_IF(QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF != htp_infra->infraType, + "HTP infra type = ", htp_infra->infraType, ", which is not perf infra type."); + QnnHtpDevice_PerfInfrastructure_t& htp_perf_infra = htp_infra->perfInfra; + constexpr const int kNumConfigs = 1; std::vector power_configs( kNumConfigs); QnnHtpPerfInfrastructure_PowerConfig_t& dcvs_config = power_configs[0]; dcvs_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; QnnHtpPerfInfrastructure_DcvsV3_t& dcvs_v3 = dcvs_config.dcvsV3Config; - dcvs_v3.contextId = htp_power_config_client_id_; + dcvs_v3.contextId = htp_power_config_client_id; dcvs_v3.setSleepDisable = 0; dcvs_v3.sleepDisable = 0; dcvs_v3.setDcvsEnable = 1; dcvs_v3.dcvsEnable = kDcvsDisable; dcvs_v3.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; // choose performance mode - switch (htp_performance_mode_) { + switch (htp_performance_mode) { case HtpPerformanceMode::kHtpBurst: dcvs_v3.setSleepLatency = 1; // true dcvs_v3.sleepLatency = kSleepMinLatency; @@ -771,25 +780,40 @@ Status QnnBackendManager::SetHtpPowerConfig() { dcvs_v3.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_NOM_PLUS; break; default: - ORT_THROW("Invalid performance profile %d", static_cast(htp_performance_mode_)); + ORT_THROW("Invalid performance profile %d", static_cast(htp_performance_mode)); break; } std::vector perf_power_configs_ptr = ObtainNullTermPtrVector(power_configs); - status = htp_perf_infra.setPowerConfig(htp_power_config_client_id_, 
perf_power_configs_ptr.data()); + status = htp_perf_infra.setPowerConfig(htp_power_config_client_id, perf_power_configs_ptr.data()); ORT_RETURN_IF(QNN_SUCCESS != status, "setPowerConfig failed for HTP performance mode."); - // Set rpc control latency here, but note that v68 doesn't support rpc polling mode. - if (rpc_control_latency_ != 0) { + return Status::OK(); +} + +Status QnnBackendManager::SetRpcControlLatency(uint32_t htp_power_config_client_id, + uint32_t rpc_control_latency) { + if (rpc_control_latency != 0) { + QnnDevice_Infrastructure_t qnn_device_infra = nullptr; + auto status = qnn_interface_.deviceGetInfrastructure(&qnn_device_infra); + ORT_RETURN_IF(QNN_SUCCESS != status, "backendGetPerfInfrastructure failed."); + + auto* htp_infra = static_cast(qnn_device_infra); + ORT_RETURN_IF(QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF != htp_infra->infraType, + "HTP infra type = ", htp_infra->infraType, ", which is not perf infra type."); + QnnHtpDevice_PerfInfrastructure_t& htp_perf_infra = htp_infra->perfInfra; + + // Set rpc control latency here, but note that v68 doesn't support rpc polling mode. constexpr int kNumRpcPollingPowerConfigs = 2; std::vector rpc_power_configs(kNumRpcPollingPowerConfigs); - QnnHtpPerfInfrastructure_PowerConfig_t& rpc_control_latency = rpc_power_configs[0]; + QnnHtpPerfInfrastructure_PowerConfig_t& rpc_control_latency_cfg = rpc_power_configs[0]; // v68 doesn't support this. QnnHtpPerfInfrastructure_PowerConfig_t& rpc_polling_time = rpc_power_configs[1]; - rpc_control_latency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; + rpc_control_latency_cfg.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; - rpc_control_latency.rpcControlLatencyConfig = rpc_control_latency_; - perf_power_configs_ptr = ObtainNullTermPtrVector(rpc_power_configs); - status = htp_perf_infra.setPowerConfig(htp_power_config_client_id_, perf_power_configs_ptr.data()); + rpc_control_latency_cfg.rpcControlLatencyConfig = rpc_control_latency; + std::vector perf_power_configs_ptr = + ObtainNullTermPtrVector(rpc_power_configs); + status = htp_perf_infra.setPowerConfig(htp_power_config_client_id, perf_power_configs_ptr.data()); ORT_RETURN_IF(QNN_SUCCESS != status, "setPowerConfig failed for RPC control latency."); } @@ -810,11 +834,7 @@ void QnnBackendManager::Split(std::vector& split_string, } } -Status QnnBackendManager::DestroyHTPPowerConfigID() { - if (htp_performance_mode_ == HtpPerformanceMode::kHtpDefault) { - return Status::OK(); - } - +Status QnnBackendManager::DestroyHTPPowerConfigID(uint32_t htp_power_config_id) { QnnDevice_Infrastructure_t qnn_device_infra = nullptr; auto status = qnn_interface_.deviceGetInfrastructure(&qnn_device_infra); ORT_RETURN_IF(QNN_SUCCESS != status, "backendGetPerfInfrastructure failed."); @@ -824,7 +844,7 @@ Status QnnBackendManager::DestroyHTPPowerConfigID() { "HTP infra type = ", htp_infra->infraType, ", which is not perf infra type."); QnnHtpDevice_PerfInfrastructure_t& htp_perf_infra = htp_infra->perfInfra; - Qnn_ErrorHandle_t destroy_ret = htp_perf_infra.destroyPowerConfigId(htp_power_config_client_id_); + Qnn_ErrorHandle_t destroy_ret = htp_perf_infra.destroyPowerConfigId(htp_power_config_id); ORT_RETURN_IF(QNN_SUCCESS != destroy_ret, "destroyPowerConfigId failed."); return Status::OK(); } @@ -834,12 +854,7 @@ void QnnBackendManager::ReleaseResources() { return; } - auto result = DestroyHTPPowerConfigID(); - if 
(Status::OK() != result) { - ORT_THROW("Failed to DestroyHTPPowerConfigID."); - } - - result = ReleaseContext(); + auto result = ReleaseContext(); if (Status::OK() != result) { ORT_THROW("Failed to ReleaseContext."); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h index 36375522b5a0a..ff97c4c3a991c 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h @@ -33,8 +33,6 @@ class QnnBackendManager { public: QnnBackendManager(std::string&& backend_path, ProfilingLevel profiling_level, - uint32_t rpc_control_latency, - HtpPerformanceMode htp_performance_mode, ContextPriority context_priority, std::string&& qnn_saver_path, uint32_t device_id, @@ -42,8 +40,6 @@ class QnnBackendManager { uint32_t soc_model) : backend_path_(backend_path), profiling_level_(profiling_level), - rpc_control_latency_(rpc_control_latency), - htp_performance_mode_(htp_performance_mode), context_priority_(context_priority), qnn_saver_path_(qnn_saver_path), device_id_(device_id), @@ -92,7 +88,13 @@ class QnnBackendManager { Status SetupBackend(const logging::Logger& logger, bool load_from_cached_context); - Status SetHtpPowerConfig(); + Status CreateHtpPowerCfgId(uint32_t deviceId, uint32_t coreId, uint32_t& htp_power_config_id); + + Status SetHtpPowerConfig(uint32_t htp_power_config_client_id, + HtpPerformanceMode htp_performance_mode); + + Status SetRpcControlLatency(uint32_t htp_power_config_client_id, + uint32_t rpc_control_latency); const QNN_INTERFACE_VER_TYPE& GetQnnInterface() { return qnn_interface_; } @@ -141,6 +143,8 @@ class QnnBackendManager { const std::string& GetSdkVersion() { return sdk_build_version_; } + Status DestroyHTPPowerConfigID(uint32_t htp_power_config_id); + private: void* LoadLib(const char* file_name, int flags, std::string& error_msg); @@ -150,8 +154,6 @@ class QnnBackendManager { Status UnloadLib(void* handle); - Status DestroyHTPPowerConfigID(); - void* LibFunction(void* handle, const char* symbol, std::string& error_msg); template @@ -232,15 +234,12 @@ class QnnBackendManager { QnnBackendType qnn_backend_type_ = QnnBackendType::CPU; Qnn_ProfileHandle_t profile_backend_handle_ = nullptr; std::vector op_package_paths_; - uint32_t rpc_control_latency_ = 0; - HtpPerformanceMode htp_performance_mode_; ContextPriority context_priority_; std::string sdk_build_version_ = ""; #ifdef _WIN32 std::set mod_handles_; #endif const std::string qnn_saver_path_; - uint32_t htp_power_config_client_id_ = 0; uint32_t device_id_ = 0; QnnHtpDevice_Arch_t htp_arch_ = QNN_HTP_DEVICE_ARCH_NONE; uint32_t soc_model_ = QNN_SOC_MODEL_UNKNOWN; diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 9a6540a3efea5..3d9cfd92b7922 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -7,6 +7,7 @@ #include "core/framework/compute_capability.h" #include "core/graph/graph_viewer.h" #include "core/session/onnxruntime_session_options_config_keys.h" +#include "core/session/onnxruntime_run_options_config_keys.h" #include "core/session/onnxruntime_cxx_api.h" #include "core/framework/kernel_registry.h" #include "core/platform/env.h" @@ -18,11 +19,36 @@ #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_def.h" #include 
"core/providers/qnn/builder/onnx_ctx_model_helper.h" +#include "core/framework/run_options.h" namespace onnxruntime { constexpr const char* QNN = "QNN"; +static std::unique_ptr>> s_run_on_unload_; + +void RunOnUnload(std::function function) { + OrtMutex mutex; + std::lock_guard guard(mutex); + if (!s_run_on_unload_) { + s_run_on_unload_ = std::make_unique>>(); + } + s_run_on_unload_->push_back(std::move(function)); +} + +struct OnUnload { + ~OnUnload() { + if (!s_run_on_unload_) + return; + + for (auto& function : *s_run_on_unload_) + function(); + + s_run_on_unload_.reset(); + } + +} g_on_unload; + static void ParseProfilingLevel(std::string profiling_level_string, qnn::ProfilingLevel& profiling_level) { std::transform(profiling_level_string.begin(), @@ -193,18 +219,18 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio } static const std::string RPC_CONTROL_LANTENCY = "rpc_control_latency"; - uint32_t rpc_control_latency = 0; auto latency_pos = provider_options_map.find(RPC_CONTROL_LANTENCY); if (latency_pos != provider_options_map.end()) { - rpc_control_latency = static_cast(std::stoul(latency_pos->second)); - LOGS_DEFAULT(VERBOSE) << "rpc_control_latency: " << rpc_control_latency; + default_rpc_control_latency_ = static_cast(std::stoul(latency_pos->second)); + LOGS_DEFAULT(VERBOSE) << "rpc_control_latency: " << default_rpc_control_latency_; } - qnn::HtpPerformanceMode htp_performance_mode = qnn::HtpPerformanceMode::kHtpDefault; + // default_htp_performance_mode from QNN EP option. + // set it once only for each thread as default so user don't need to set it for every session run static const std::string HTP_PERFORMANCE_MODE = "htp_performance_mode"; auto htp_performance_mode_pos = provider_options_map.find(HTP_PERFORMANCE_MODE); if (htp_performance_mode_pos != provider_options_map.end()) { - ParseHtpPerformanceMode(htp_performance_mode_pos->second, htp_performance_mode); + ParseHtpPerformanceMode(htp_performance_mode_pos->second, default_htp_performance_mode_); } htp_graph_finalization_opt_mode_ = qnn::HtpGraphFinalizationOptimizationMode::kDefault; @@ -241,15 +267,14 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio } static const std::string QNN_DEVICE_ID = "device_id"; - uint32_t device_id = 0; auto dev_id_pos = provider_options_map.find(QNN_DEVICE_ID); if (dev_id_pos != provider_options_map.end()) { int value = std::stoi(dev_id_pos->second); if (value < 0) { LOGS_DEFAULT(WARNING) << "Invalid device ID '" << value - << "', only >= 0 allowed. Set to " << device_id << "."; + << "', only >= 0 allowed. 
Set to " << device_id_ << "."; } else { - device_id = static_cast(value); + device_id_ = static_cast(value); } } @@ -276,15 +301,23 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio qnn_backend_manager_ = std::make_unique( std::move(backend_path), profiling_level, - rpc_control_latency, - htp_performance_mode, context_priority, std::move(qnn_saver_path), - device_id, + device_id_, htp_arch, soc_model); } +QNNExecutionProvider::~QNNExecutionProvider() { + // clean up thread local context caches + std::lock_guard lock(context_state_.mutex); + for (const auto& cache_weak : context_state_.caches_to_update_on_destruction) { + const auto cache = cache_weak.lock(); + if (!cache) continue; + ORT_IGNORE_RETURN_VALUE(cache->erase(this)); + } +} + bool QNNExecutionProvider::IsNodeSupported(qnn::QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, const logging::Logger& logger) const { const std::string& op_type = node_unit.OpType(); @@ -725,4 +758,147 @@ const InlinedVector QNNExecutionProvider::GetEpContextNodes() const return ep_context_nodes; } + +QNNExecutionProvider::PerThreadContext::PerThreadContext(qnn::QnnBackendManager* qnn_backend_manager, + uint32_t device_id, + uint32_t core_id, + qnn::HtpPerformanceMode default_htp_performance_mode, + uint32_t default_rpc_control_latency) + : qnn_backend_manager_(qnn_backend_manager) { + Status rt = qnn_backend_manager_->CreateHtpPowerCfgId(device_id, core_id, htp_power_config_id_); + is_htp_power_config_id_valid_ = rt.IsOK(); + // default_htp_performance_mode and default_rpc_control_latency are from QNN EP option. + // set it once only for each thread as default so user don't need to set it for every session run + if (is_htp_power_config_id_valid_) { + if (qnn::HtpPerformanceMode::kHtpDefault != default_htp_performance_mode) { + ORT_IGNORE_RETURN_VALUE(qnn_backend_manager_->SetHtpPowerConfig(htp_power_config_id_, + default_htp_performance_mode)); + } + if (default_rpc_control_latency > 0) { + ORT_IGNORE_RETURN_VALUE(qnn_backend_manager_->SetRpcControlLatency(htp_power_config_id_, + default_rpc_control_latency)); + } + } +} + +QNNExecutionProvider::PerThreadContext::~PerThreadContext() { + if (is_htp_power_config_id_valid_) { + ORT_IGNORE_RETURN_VALUE(qnn_backend_manager_->DestroyHTPPowerConfigID(htp_power_config_id_)); + } +} + +QNNExecutionProvider::PerThreadContext& QNNExecutionProvider::GetPerThreadContext() const { + const auto& per_thread_context_cache = PerThreadContextCache(); + + // try to use cached context + auto cached_context_it = per_thread_context_cache->find(this); + if (cached_context_it != per_thread_context_cache->end()) { + auto cached_context = cached_context_it->second.lock(); + ORT_ENFORCE(cached_context); + return *cached_context; + } + + // get context and update cache + std::shared_ptr context; + { + std::lock_guard lock(context_state_.mutex); + + // get or create a context + if (context_state_.retired_context_pool.empty()) { + uint32_t core_id = 0; + context = std::make_shared(qnn_backend_manager_.get(), device_id_, core_id, + default_htp_performance_mode_, default_rpc_control_latency_); + } else { + context = context_state_.retired_context_pool.back(); + context_state_.retired_context_pool.pop_back(); + } + + // insert into active_contexts, should not already be present + const auto active_contexts_insert_result = context_state_.active_contexts.insert(context); + ORT_ENFORCE(active_contexts_insert_result.second); + + // insert into caches_to_update_on_destruction, may already be 
present + ORT_IGNORE_RETURN_VALUE(context_state_.caches_to_update_on_destruction.insert(per_thread_context_cache)); + } + + per_thread_context_cache->insert(std::make_pair(this, context)); + + return *context; +} + +void QNNExecutionProvider::ReleasePerThreadContext() const { + const auto& per_thread_context_cache = PerThreadContextCache(); + + auto cached_context_it = per_thread_context_cache->find(this); + ORT_ENFORCE(cached_context_it != per_thread_context_cache->end()); + auto cached_context = cached_context_it->second.lock(); + ORT_ENFORCE(cached_context); + + { + std::lock_guard lock(context_state_.mutex); + context_state_.active_contexts.erase(cached_context); + context_state_.retired_context_pool.push_back(cached_context); + } + + per_thread_context_cache->erase(cached_context_it); +} + +Status QNNExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_options) { + auto backend_type = qnn_backend_manager_->GetQnnBackendType(); + if (qnn::QnnBackendType::HTP != backend_type && qnn::QnnBackendType::DSP != backend_type) { + return Status::OK(); + } + + std::string htp_perf_mode = ""; + qnn::HtpPerformanceMode htp_performance_mode = qnn::HtpPerformanceMode::kHtpDefault; + if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnPerfMode, htp_perf_mode)) { + // set power mode + ParseHtpPerformanceMode(htp_perf_mode, htp_performance_mode); + } + + std::string rpc_latency = ""; + uint32_t rpc_control_latency = 0; + if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnRpcControlLatency, rpc_latency)) { + rpc_control_latency = static_cast(std::stoul(rpc_latency)); + LOGS_DEFAULT(VERBOSE) << "rpc_control_latency: " << rpc_control_latency; + } + + if (GetPerThreadContext().IsHtpPowerConfigIdValid()) { + if (qnn::HtpPerformanceMode::kHtpDefault != htp_performance_mode) { + ORT_RETURN_IF_ERROR(qnn_backend_manager_->SetHtpPowerConfig(GetPerThreadContext().GetHtpPowerConfigId(), + htp_performance_mode)); + } + + if (rpc_control_latency > 0) { + ORT_RETURN_IF_ERROR(qnn_backend_manager_->SetRpcControlLatency(GetPerThreadContext().GetHtpPowerConfigId(), + rpc_control_latency)); + } + } + + return Status::OK(); +} + +Status QNNExecutionProvider::OnRunEnd(bool /*sync_stream*/, const onnxruntime::RunOptions& run_options) { + auto backend_type = qnn_backend_manager_->GetQnnBackendType(); + if (qnn::QnnBackendType::HTP != backend_type && qnn::QnnBackendType::DSP != backend_type) { + return Status::OK(); + } + + std::string htp_perf_mode = ""; + qnn::HtpPerformanceMode htp_performance_mode = qnn::HtpPerformanceMode::kHtpDefault; + if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnPerfModePostRun, htp_perf_mode)) { + // set power mode + ParseHtpPerformanceMode(htp_perf_mode, htp_performance_mode); + } + + if (qnn::HtpPerformanceMode::kHtpDefault != htp_performance_mode) { + if (!GetPerThreadContext().IsHtpPowerConfigIdValid()) { + return Status::OK(); + } + ORT_RETURN_IF_ERROR(qnn_backend_manager_->SetHtpPowerConfig(GetPerThreadContext().GetHtpPowerConfigId(), + htp_performance_mode)); + } + + return Status::OK(); +} } // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index 0bcaa39b22f6d..43b5e7bff827e 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -12,14 +12,19 @@ #include "core/providers/qnn/builder/qnn_model.h" #include 
"core/providers/qnn/builder/qnn_configs_helper.h" #include "HTP/QnnHtpGraph.h" +#include +#include +#include namespace onnxruntime { +void RunOnUnload(std::function function); + // Logical device representation. class QNNExecutionProvider : public IExecutionProvider { public: explicit QNNExecutionProvider(const ProviderOptions& provider_options_map, const SessionOptions* session_options); - virtual ~QNNExecutionProvider() = default; + virtual ~QNNExecutionProvider(); ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(QNNExecutionProvider); // we implement the Compile that takes FusedNodeAndGraph instances @@ -40,6 +45,10 @@ class QNNExecutionProvider : public IExecutionProvider { const InlinedVector GetEpContextNodes() const override; + Status OnRunStart(const onnxruntime::RunOptions& run_options) override; + + Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override; + private: bool IsNodeSupported(qnn::QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, const logging::Logger& logger) const; @@ -72,6 +81,68 @@ class QNNExecutionProvider : public IExecutionProvider { int32_t vtcm_size_in_mb_ = 0; std::unique_ptr qnn_ep_context_model_; ModelMetadefIdGenerator metadef_id_generator_; + uint32_t device_id_ = 0; + qnn::HtpPerformanceMode default_htp_performance_mode_ = qnn::HtpPerformanceMode::kHtpDefault; + uint32_t default_rpc_control_latency_ = 0; + + class PerThreadContext final { + public: + PerThreadContext(qnn::QnnBackendManager* qnn_backend_manager, + uint32_t device_id, uint32_t core_id, + qnn::HtpPerformanceMode default_htp_performance_mode, + uint32_t default_rpc_control_latency); + ~PerThreadContext(); + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PerThreadContext); + + bool IsHtpPowerConfigIdValid() { return is_htp_power_config_id_valid_; } + + uint32_t GetHtpPowerConfigId() { return htp_power_config_id_; } + + private: + bool is_htp_power_config_id_valid_ = false; + uint32_t htp_power_config_id_ = 0; + qnn::QnnBackendManager* qnn_backend_manager_; + }; + + using PerThreadContextMap = std::unordered_map>; + + struct ContextCacheHolder { + ContextCacheHolder() { + RunOnUnload([&, weak_p_ = std::weak_ptr(p)] { + if (auto lock = weak_p_.lock()) + p.reset(); + }); + } + + std::shared_ptr p = std::make_shared(); + }; + + static const std::shared_ptr& PerThreadContextCache() { + thread_local const ContextCacheHolder per_thread_context_cache; + return per_thread_context_cache.p; + } + + struct PerThreadContextState { + // contexts that are currently active + std::set, std::owner_less>> active_contexts; + // contexts available for reuse + std::vector> retired_context_pool; + // weak references to thread local caches from which this QNNExecutionProvider instance's entry should be removed + // upon destruction + std::set, std::owner_less>> + caches_to_update_on_destruction; + // synchronizes access to PerThreadContextState members + OrtMutex mutex; + }; + + // The execution provider maintains the PerThreadContexts in this structure. + // Synchronization is required to update the contained structures. + // On the other hand, access to an individual PerThreadContext is assumed to be from a single thread at a time, + // so synchronization is not required for that. 
+ mutable PerThreadContextState context_state_; + + PerThreadContext& GetPerThreadContext() const; + void ReleasePerThreadContext() const; }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc index ee3578326ac6d..3fd5423681b81 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc @@ -353,7 +353,7 @@ Status ROCMExecutionProvider::Sync() const { return Status::OK(); } -Status ROCMExecutionProvider::OnRunStart() { +Status ROCMExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { // always set ROCM device when session::Run() in case it runs in a worker thread HIP_RETURN_IF_ERROR(hipSetDevice(GetDeviceId())); if (IsGraphCaptureEnabled() && GetPerThreadContext().IsGraphCaptureAllowed() && !GetPerThreadContext().IsGraphCaptured()) { @@ -363,7 +363,7 @@ Status ROCMExecutionProvider::OnRunStart() { return Status::OK(); } -Status ROCMExecutionProvider::OnRunEnd(bool sync_stream) { +Status ROCMExecutionProvider::OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& /*run_options*/) { if (IsGraphCaptureEnabled() && !GetPerThreadContext().IsGraphCaptured()) { if (GetPerThreadContext().IsGraphCaptureAllowed()) { GetPerThreadContext().CaptureEnd(); diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.h b/onnxruntime/core/providers/rocm/rocm_execution_provider.h index 37d5f7b42210f..da671d9e863bb 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.h +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.h @@ -28,9 +28,9 @@ class ROCMExecutionProvider : public IExecutionProvider { Status Sync() const override; - Status OnRunStart() override; + Status OnRunStart(const onnxruntime::RunOptions& run_options) override; - Status OnRunEnd(bool sync_stream) override; + Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override; const void* GetExecutionHandle() const noexcept override { // The ROCM interface does not return anything interesting. 
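Note on usage of the QNN changes above: the new OnRunStart/OnRunEnd hooks read the HTP power settings from per-run config entries, so an application can raise the power mode only around latency-critical calls and drop it again afterwards. A minimal caller-side sketch is below; it assumes an already created `Ort::Session` using the QNN EP, and the key constants come from `onnxruntime_run_options_config_keys.h`. Everything outside the `AddConfigEntry` calls is illustrative, not code from this diff.

```cpp
// Sketch only: assumes `session` was created with the QNN EP and that the caller
// already owns valid input/output name arrays and Ort::Value inputs.
#include <vector>
#include "onnxruntime_cxx_api.h"
#include "onnxruntime_run_options_config_keys.h"

std::vector<Ort::Value> RunWithBurstPower(Ort::Session& session,
                                          const char* const* input_names, const Ort::Value* inputs, size_t input_count,
                                          const char* const* output_names, size_t output_count) {
  Ort::RunOptions run_options;
  run_options.AddConfigEntry(kOrtRunOptionsConfigQnnPerfMode, "burst");                   // applied in OnRunStart
  run_options.AddConfigEntry(kOrtRunOptionsConfigQnnPerfModePostRun, "low_power_saver");  // applied in OnRunEnd
  run_options.AddConfigEntry(kOrtRunOptionsConfigQnnRpcControlLatency, "10");             // value parsed by OnRunStart
  return session.Run(run_options, input_names, inputs, input_count, output_names, output_count);
}
```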
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index c0bf29e486c88..81346671f2aad 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -1818,11 +1818,11 @@ std::unique_ptr TensorrtExecutionProvider::GetDataTransfer() cons return onnxruntime::CreateGPUDataTransfer(); } -Status TensorrtExecutionProvider::OnRunStart() { +Status TensorrtExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { return Status::OK(); } -Status TensorrtExecutionProvider::OnRunEnd(bool sync_stream) { +Status TensorrtExecutionProvider::OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& /*run_options*/) { if (sync_stream && external_stream_) { CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream_)); } diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index e86f997b6597a..26f6b2dcc3020 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -233,8 +233,8 @@ class TensorrtExecutionProvider : public IExecutionProvider { common::Status Compile(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) override; - Status OnRunStart() override; - Status OnRunEnd(bool sync_stream) override; + Status OnRunStart(const onnxruntime::RunOptions& run_options) override; + Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override; ProviderOptions GetProviderOptions() const override { return TensorrtExecutionProviderInfo::ToProviderOptions(info_); diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index b045f30a59797..efd7db4ea7629 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -2289,8 +2289,8 @@ Status InferenceSession::PartialRun(onnxruntime::RunOptions& run_options, // TODO: only call OnRunStart for all providers in-use for (auto& xp : execution_providers_) { // call OnRunStart and add to exec_providers_to_stop if successful - auto start_func = [&xp, &exec_providers_to_stop]() { - auto status = xp->OnRunStart(); + auto start_func = [&xp, &exec_providers_to_stop, run_options]() { + auto status = xp->OnRunStart(run_options); if (status.IsOK()) exec_providers_to_stop.push_back(xp.get()); @@ -2326,7 +2326,7 @@ Status InferenceSession::PartialRun(onnxruntime::RunOptions& run_options, // info all execution providers InferenceSession:Run ended for (auto* xp : exec_providers_to_stop) { - auto status = xp->OnRunEnd(/*sync_stream*/ false); + auto status = xp->OnRunEnd(/*sync_stream*/ false, run_options); ORT_CHECK_AND_SET_RETVAL(status); } @@ -2448,8 +2448,8 @@ Status InferenceSession::Run(const RunOptions& run_options, // TODO: only call OnRunStart for all providers in-use for (auto& xp : execution_providers_) { // call OnRunStart and add to exec_providers_to_stop if successful - auto start_func = [&xp, &exec_providers_to_stop]() { - auto status = xp->OnRunStart(); + auto start_func = [&xp, &exec_providers_to_stop, &run_options]() { + auto status = xp->OnRunStart(run_options); if (status.IsOK()) exec_providers_to_stop.push_back(xp.get()); @@ -2490,7 +2490,7 @@ Status InferenceSession::Run(const RunOptions& run_options, // info all execution providers 
InferenceSession:Run ended for (auto* xp : exec_providers_to_stop) { bool synchronize_execution_providers = run_options.config_options.GetConfigOrDefault(kOrtRunOptionsConfigDisableSynchronizeExecutionProviders, "0") == "0"; - auto status = xp->OnRunEnd(synchronize_execution_providers); + auto status = xp->OnRunEnd(synchronize_execution_providers, run_options); ORT_CHECK_AND_SET_RETVAL(status); } diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc index a70e439cdf755..5505d689381c9 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc @@ -22,6 +22,8 @@ TEST(TestDeferredRelease, WithArena) { CUDAExecutionProvider ep(info); AllocatorPtr gpu_alloctor = ep.CreatePreferredAllocators()[0]; + RunOptions run_opts; + run_opts.run_tag = "log1"; // Allocator for call cudaMallocHost and cudaFreeHost // For details, see CUDAPinnedAllocator in cuda_allocator.cc. AllocatorPtr cpu_pinned_alloc = ep.CreatePreferredAllocators()[1]; @@ -31,7 +33,7 @@ TEST(TestDeferredRelease, WithArena) { // 10 MB const size_t n_bytes = 10 * 1000000; const int64_t n_allocs = 64; - ORT_THROW_IF_ERROR(ep.OnRunStart()); + ORT_THROW_IF_ERROR(ep.OnRunStart(run_opts)); for (size_t i = 0; i < n_allocs; ++i) { // Allocate 10MB CUDA pinned memory. auto pinned_buffer = IAllocator::MakeUniquePtr(cpu_pinned_alloc, n_bytes); @@ -44,7 +46,7 @@ TEST(TestDeferredRelease, WithArena) { cpu_pinned_alloc->GetStats(&stats); ASSERT_EQ(stats.num_allocs, n_allocs); ORT_THROW_IF_ERROR(stream.CleanUpOnRunEnd()); - ORT_THROW_IF_ERROR(ep.OnRunEnd(true)); + ORT_THROW_IF_ERROR(ep.OnRunEnd(true, run_opts)); } TEST(TestDeferredRelease, WithoutArena) { @@ -52,6 +54,9 @@ TEST(TestDeferredRelease, WithoutArena) { CUDAExecutionProviderInfo info; CUDAExecutionProvider ep(info); + RunOptions run_opts; + run_opts.run_tag = "log1"; + OrtDevice pinned_device{OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, DEFAULT_CPU_ALLOCATOR_DEVICE_ID}; // Create allocator without BFCArena AllocatorCreationInfo pinned_memory_info( @@ -70,7 +75,7 @@ TEST(TestDeferredRelease, WithoutArena) { // 10 MB const size_t n_bytes = 10 * 1000000; const int64_t n_allocs = 64; - ORT_THROW_IF_ERROR(ep.OnRunStart()); + ORT_THROW_IF_ERROR(ep.OnRunStart(run_opts)); for (size_t i = 0; i < n_allocs; ++i) { // Allocate 10MB CUDA pinned memory. 
auto pinned_buffer = IAllocator::MakeUniquePtr(cuda_pinned_alloc, n_bytes); @@ -79,7 +84,7 @@ TEST(TestDeferredRelease, WithoutArena) { } ORT_THROW_IF_ERROR(stream.CleanUpOnRunEnd()); - ORT_THROW_IF_ERROR(ep.OnRunEnd(true)); + ORT_THROW_IF_ERROR(ep.OnRunEnd(true, run_opts)); } } // namespace test diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc index 4e1aef2c40b2b..8f07c2ce77e77 100644 --- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc @@ -7,6 +7,7 @@ #include "core/session/onnxruntime_cxx_api.h" #include "core/session/onnxruntime_session_options_config_keys.h" +#include "core/session/onnxruntime_run_options_config_keys.h" #include "core/providers/cpu/cpu_provider_factory.h" // For OrtSessionOptionsAppendExecutionProvider_CPU #include "core/session/inference_session.h" @@ -332,19 +333,23 @@ static void CreateModelInMemory(std::unique_ptr& result, static void RunSessionAndVerify(InferenceSession& session, const RunOptions& run_options, const NameMLValMap& feeds, const std::vector& output_names, const std::vector>& output_shapes, - const std::vector>& expected_values) { - std::vector fetches; - auto status = session.Run(run_options, feeds, output_names, &fetches); - ASSERT_TRUE(status.IsOK()); - - for (size_t i = 0; i < fetches.size(); i++) { - auto& tensor = fetches[i].Get(); - TensorShape expected_shape(output_shapes[i]); - ASSERT_EQ(expected_shape, tensor.Shape()); - - gsl::span actual = tensor.DataAsSpan(); - gsl::span expected(expected_values[i].data(), expected_values[i].size()); - ASSERT_EQ(expected, actual); + const std::vector>& expected_values, + int loop_count = 10) { + // Let it run for a while + for (int it = 0; it < loop_count; ++it) { + std::vector fetches; + auto status = session.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(status.IsOK()); + + for (size_t i = 0; i < fetches.size(); i++) { + auto& tensor = fetches[i].Get(); + TensorShape expected_shape(output_shapes[i]); + ASSERT_EQ(expected_shape, tensor.Shape()); + + gsl::span actual = tensor.DataAsSpan(); + gsl::span expected(expected_values[i].data(), expected_values[i].size()); + ASSERT_EQ(expected, actual); + } } } @@ -404,11 +409,11 @@ TEST_F(QnnCPUBackendTests, MultithreadSessionRun) { std::vector threads; constexpr int num_threads = 5; - + constexpr int loop_count = 10; for (int i = 0; i < num_threads; i++) { threads.push_back(std::thread(RunSessionAndVerify, std::ref(session_obj), run_opts, model->builder.feeds_, model->builder.output_names_, - output_shapes, output_values)); + output_shapes, output_values, loop_count)); } for (auto& th : threads) { @@ -484,11 +489,191 @@ TEST_F(QnnHTPBackendTests, MultithreadSessionRun) { std::vector threads; constexpr int num_threads = 5; + constexpr int loop_count = 10; for (int i = 0; i < num_threads; i++) { threads.push_back(std::thread(RunSessionAndVerify, std::ref(session_obj), run_opts, model->builder.feeds_, model->builder.output_names_, - output_shapes, output_values)); + output_shapes, output_values, loop_count)); + } + + for (auto& th : threads) { + th.join(); + } +} + +// Tests running a single session in multiple threads on the HTP backend with run option to set power config +TEST_F(QnnHTPBackendTests, MultithreadHtpPowerCfgSessionRunOption) { + std::unique_ptr model; + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + std::vector shape = {1, 3, 2}; + std::vector> output_shapes = {shape}; + std::vector> output_values 
= {{3.0f, 6.0f, 9.0f, 12.0f, 15.0f, 18.0f}}; + + CreateModelInMemory(model, + QDQBuildAdd3Tensors(TestInputDef(shape, false, input_data), + TestInputDef(shape, false, input_data), + TestInputDef(shape, false, input_data)), + "add3.qdq"); + + SessionOptions session_opts; + session_opts.session_logid = "logger0"; + + InferenceSession session_obj{session_opts, GetEnvironment()}; + onnxruntime::ProviderOptions options; + +#if defined(_WIN32) + options["backend_path"] = "QnnHtp.dll"; +#else + options["backend_path"] = "libQnnHtp.so"; +#endif + + auto qnn_ep = QnnExecutionProviderWithOptions(options, &session_opts); + EXPECT_TRUE(session_obj.RegisterExecutionProvider(std::move(qnn_ep)).IsOK()); + + auto status = session_obj.Load(model->model_data.data(), static_cast(model->model_data.size())); + ASSERT_TRUE(status.IsOK()); + status = session_obj.Initialize(); + ASSERT_TRUE(status.IsOK()); + + std::vector threads; + constexpr int num_threads = 5; + constexpr int loop_count = 10; + + std::vector perf_modes{ + "burst", "balanced", "default", "high_performance", "high_power_saver", + "low_balanced", "extreme_power_saver", "low_power_saver", "power_saver"}; + + size_t post_i = perf_modes.size() - 1; + ASSERT_TRUE(post_i > num_threads); + for (int i = 0; i < num_threads; ++i, --post_i) { + RunOptions run_opts; + run_opts.run_tag = session_opts.session_logid; + auto rt = run_opts.config_options.AddConfigEntry(kOrtRunOptionsConfigQnnPerfMode, perf_modes[i].c_str()); + ASSERT_TRUE(rt.IsOK()); + rt = run_opts.config_options.AddConfigEntry(kOrtRunOptionsConfigQnnPerfModePostRun, perf_modes[post_i].c_str()); + ASSERT_TRUE(rt.IsOK()); + + threads.push_back(std::thread(RunSessionAndVerify, std::ref(session_obj), run_opts, + model->builder.feeds_, model->builder.output_names_, + output_shapes, output_values, loop_count)); + } + + for (auto& th : threads) { + th.join(); + } +} + +// Tests running a single session in multiple threads on the HTP backend with EP option to set default power config +TEST_F(QnnHTPBackendTests, MultithreadDefaultHtpPowerCfgFromEpOption) { + std::unique_ptr model; + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + std::vector shape = {1, 3, 2}; + std::vector> output_shapes = {shape}; + std::vector> output_values = {{3.0f, 6.0f, 9.0f, 12.0f, 15.0f, 18.0f}}; + + CreateModelInMemory(model, + QDQBuildAdd3Tensors(TestInputDef(shape, false, input_data), + TestInputDef(shape, false, input_data), + TestInputDef(shape, false, input_data)), + "add3.qdq"); + + SessionOptions session_opts; + session_opts.session_logid = "logger0"; + + RunOptions run_opts; + run_opts.run_tag = session_opts.session_logid; + + InferenceSession session_obj{session_opts, GetEnvironment()}; + onnxruntime::ProviderOptions options; + +#if defined(_WIN32) + options["backend_path"] = "QnnHtp.dll"; +#else + options["backend_path"] = "libQnnHtp.so"; +#endif + options["htp_performance_mode"] = "burst"; + + auto qnn_ep = QnnExecutionProviderWithOptions(options, &session_opts); + EXPECT_TRUE(session_obj.RegisterExecutionProvider(std::move(qnn_ep)).IsOK()); + + auto status = session_obj.Load(model->model_data.data(), static_cast(model->model_data.size())); + ASSERT_TRUE(status.IsOK()); + status = session_obj.Initialize(); + ASSERT_TRUE(status.IsOK()); + + std::vector threads; + constexpr int num_threads = 5; + constexpr int loop_count = 10; + + for (int i = 0; i < num_threads; i++) { + threads.push_back(std::thread(RunSessionAndVerify, std::ref(session_obj), run_opts, + model->builder.feeds_, 
model->builder.output_names_, + output_shapes, output_values, loop_count)); + } + + for (auto& th : threads) { + th.join(); + } +} + +// Tests running a single session in multiple threads on the HTP backend with +// EP option to set default power config + run option to set power config for each run +TEST_F(QnnHTPBackendTests, MultithreadHtpPowerCfgDefaultAndRunOption) { + std::unique_ptr model; + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + std::vector shape = {1, 3, 2}; + std::vector> output_shapes = {shape}; + std::vector> output_values = {{3.0f, 6.0f, 9.0f, 12.0f, 15.0f, 18.0f}}; + + CreateModelInMemory(model, + QDQBuildAdd3Tensors(TestInputDef(shape, false, input_data), + TestInputDef(shape, false, input_data), + TestInputDef(shape, false, input_data)), + "add3.qdq"); + + SessionOptions session_opts; + session_opts.session_logid = "logger0"; + + InferenceSession session_obj{session_opts, GetEnvironment()}; + onnxruntime::ProviderOptions options; + +#if defined(_WIN32) + options["backend_path"] = "QnnHtp.dll"; +#else + options["backend_path"] = "libQnnHtp.so"; +#endif + options["htp_performance_mode"] = "burst"; + + auto qnn_ep = QnnExecutionProviderWithOptions(options, &session_opts); + EXPECT_TRUE(session_obj.RegisterExecutionProvider(std::move(qnn_ep)).IsOK()); + + auto status = session_obj.Load(model->model_data.data(), static_cast(model->model_data.size())); + ASSERT_TRUE(status.IsOK()); + status = session_obj.Initialize(); + ASSERT_TRUE(status.IsOK()); + + std::vector threads; + constexpr int num_threads = 5; + constexpr int loop_count = 10; + + std::vector perf_modes{ + "burst", "balanced", "default", "high_performance", "high_power_saver", + "low_balanced", "extreme_power_saver", "low_power_saver", "power_saver"}; + + size_t post_i = perf_modes.size() - 1; + ASSERT_TRUE(post_i > num_threads); + for (int i = 0; i < num_threads; ++i, --post_i) { + RunOptions run_opts; + run_opts.run_tag = session_opts.session_logid; + auto rt = run_opts.config_options.AddConfigEntry(kOrtRunOptionsConfigQnnPerfMode, perf_modes[i].c_str()); + ASSERT_TRUE(rt.IsOK()); + rt = run_opts.config_options.AddConfigEntry(kOrtRunOptionsConfigQnnPerfModePostRun, perf_modes[post_i].c_str()); + ASSERT_TRUE(rt.IsOK()); + + threads.push_back(std::thread(RunSessionAndVerify, std::ref(session_obj), run_opts, + model->builder.feeds_, model->builder.output_names_, + output_shapes, output_values, loop_count)); } for (auto& th : threads) { From 29b1106033e291947debb49c3fd03feb479c4b1b Mon Sep 17 00:00:00 2001 From: Segev Finer Date: Fri, 23 Feb 2024 04:53:50 +0200 Subject: [PATCH 046/279] [node] Switch to setImmediate to avoid starving the Node.js event loop (#19610) ### Description Switch to setImmediate to avoid starving the Node.js event loop There should really be a true async version though, running computationally intensive things on the event loop will stop everything else from happening while it is running, e.g. a web server from answering requests. This can be done by wrapping `RunAsync` behind a [`napi::Promise`](https://github.com/nodejs/node-addon-api/blob/main/doc/promises.md) to run on the onnxruntime thread pool or [`AsyncWorker`]( https://github.com/nodejs/node-addon-api/blob/main/doc/async_worker.md) for the Node.js/libuv thread pool. 
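A rough sketch of the AsyncWorker route mentioned above is shown here. It is illustrative only (the class and function names are made up and this is not the actual onnxruntime-node binding code): the blocking run is pushed onto the libuv thread pool and the result surfaces to JS as a Promise, instead of the run being scheduled back onto the JS thread with setImmediate.

```cpp
// Illustrative sketch using node-addon-api; not code from this diff.
#include <napi.h>
#include <functional>
#include <utility>

class RunWorker : public Napi::AsyncWorker {
 public:
  RunWorker(Napi::Env env, std::function<void()> blocking_run)
      : Napi::AsyncWorker(env),
        deferred_(Napi::Promise::Deferred::New(env)),
        blocking_run_(std::move(blocking_run)) {}

  Napi::Promise GetPromise() { return deferred_.Promise(); }

 protected:
  // Runs on a libuv worker thread; no JS values may be touched here.
  void Execute() override { blocking_run_(); }
  // Back on the JS thread: resolve the promise (a real binding would convert outputs here).
  void OnOK() override { deferred_.Resolve(Env().Undefined()); }
  void OnError(const Napi::Error& e) override { deferred_.Reject(e.Value()); }

 private:
  Napi::Promise::Deferred deferred_;
  std::function<void()> blocking_run_;
};

// Example entry point: queue the work and hand the promise back to JS immediately.
Napi::Value RunAsync(const Napi::CallbackInfo& info) {
  auto* worker = new RunWorker(info.Env(), [] { /* call the blocking session run here */ });
  worker->Queue();  // AsyncWorker deletes itself once it completes
  return worker->GetPromise();
}
```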
### Motivation and Context Without this, if you run inference in a tight loop, without anything else in between that is async/deferred, `process.nextTick` will lead to starving the event loop and not letting anything else run, `setImmediate` at least lets the event loop spin between calls to `run`. See https://dev.to/ynmanware/setimmediate-settimeout-and-process-nexttick-3mfd Contributed on behalf of [Swimm](https://swimm.io/) --- js/node/lib/backend.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/js/node/lib/backend.ts b/js/node/lib/backend.ts index e8eb0e9babf5a..927953b4f1dd6 100644 --- a/js/node/lib/backend.ts +++ b/js/node/lib/backend.ts @@ -36,7 +36,7 @@ class OnnxruntimeSessionHandler implements InferenceSessionHandler { async run(feeds: SessionHandler.FeedsType, fetches: SessionHandler.FetchesType, options: InferenceSession.RunOptions): Promise { return new Promise((resolve, reject) => { - process.nextTick(() => { + setImmediate(() => { try { resolve(this.#inferenceSession.run(feeds, fetches, options)); } catch (e) { @@ -56,7 +56,7 @@ class OnnxruntimeBackend implements Backend { async createInferenceSessionHandler(pathOrBuffer: string|Uint8Array, options?: InferenceSession.SessionOptions): Promise { return new Promise((resolve, reject) => { - process.nextTick(() => { + setImmediate(() => { try { resolve(new OnnxruntimeSessionHandler(pathOrBuffer, options || {})); } catch (e) { From ae92d593c0e2b06decbea64797f9145bc10f34af Mon Sep 17 00:00:00 2001 From: pengwa Date: Fri, 23 Feb 2024 11:05:16 +0800 Subject: [PATCH 047/279] ONNX Gelu Op in Opset 20 (#19560) ### ONNX Gelu Op in Opset 20 Refactor code to support MSDomain Gelu and ONNX Gelu-opset20 Op 1. Move CPU-GELU implmentation from `onnxruntime/contrib_ops/cpu/activations.h/cc` to `onnxruntime/core/providers/cpu/tensor/gelu.h/cc`, as the implementation for approximate attribute to be 'none'. 2. Dumplicate some logic from `onnxruntime/contrib_ops/cpu/bert/bias_gelu.cc` to `onnxruntime/core/providers/cpu/tensor/gelu.h/cc`, as the implementation for approximate attribute to be 'tanh'. 3. Register ONNX domain Gelu CPU kernel from opset 20 in `onnxruntime/core/providers/cpu/cpu_execution_provider.cc`. 4. Move `onnxruntime/contrib_ops/cuda/bert/fast_gelu_impl.h/cu` to `onnxruntime/core/providers/cuda/tensor/gelu_impl.h` and `onnxruntime/core/providers/cuda/tensor/gelu_approximate_impl.cu` respectively, as the implementation for approximate attribute to be 'tanh'. 5. Implement the logic for approximate attribute to be 'none' in `onnxruntime/core/providers/cuda/tensor/gelu_impl.cu`. 6. Register ONNX domain Gelu CUDA kernel from opset 20 in `onnxruntime/core/providers/cuda/cuda_execution_provider.cc`. 7. ROCM ep related changes. 8. Enrich the tests for ONNX domain Gelu in `onnxruntime/test/providers/cpu/activation/activation_op_test.cc`. 
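As a quick reference for the two modes wired up above, this is the math the kernels and the new tests expect, written as a minimal scalar sketch (the function names are illustrative, not code from this diff):

```cpp
// Scalar reference for the two approximation modes of ONNX Gelu-20.
#include <cmath>

float GeluNone(float x) {  // approximate = "none": 0.5 * x * (1 + erf(x / sqrt(2)))
  return 0.5f * x * (1.0f + std::erf(x * 0.7071067811865475f));  // 1/sqrt(2)
}

float GeluTanh(float x) {  // approximate = "tanh": 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
  constexpr float kB = 0.7978845608028654f;  // sqrt(2/pi), same constant as in the CPU kernel
  return 0.5f * x * (1.0f + std::tanh(kB * (x + 0.044715f * x * x * x)));
}
```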
--- cmake/onnxruntime_rocm_hipify.cmake | 4 - .../InferenceTest.netcore.cs | 2 +- docs/OperatorKernels.md | 2 + .../core/providers/cuda/cuda_resource.h | 2 +- onnxruntime/contrib_ops/cpu/activations.cc | 10 +- onnxruntime/contrib_ops/cpu/activations.h | 41 ------- .../cuda/activation/activations.cc | 1 - .../contrib_ops/cuda/activation/activations.h | 11 -- .../cuda/activation/activations_impl.cu | 14 --- .../cuda/activation/activations_impl.h | 2 - .../contrib_ops/cuda/bert/fast_gelu.cc | 20 +++- onnxruntime/contrib_ops/cuda/bert/fast_gelu.h | 2 +- .../contrib_ops/rocm/bert/fast_gelu.cc | 59 ---------- onnxruntime/contrib_ops/rocm/bert/fast_gelu.h | 24 ---- .../providers/cpu/cpu_execution_provider.cc | 2 + onnxruntime/core/providers/cpu/tensor/gelu.cc | 108 ++++++++++++++++++ onnxruntime/core/providers/cpu/tensor/gelu.h | 18 +++ .../providers/cuda/cuda_execution_provider.cc | 10 ++ .../core/providers/cuda/tensor/gelu.cc | 89 +++++++++++++++ onnxruntime/core/providers/cuda/tensor/gelu.h | 28 +++++ .../cuda/tensor/gelu_approximate_impl.cu} | 17 ++- .../core/providers/cuda/tensor/gelu_impl.cu | 48 ++++++++ .../providers/cuda/tensor/gelu_impl.h} | 7 +- .../test/contrib_ops/activation_op_test.cc | 13 ++- .../test/onnx/microbenchmark/activation.cc | 3 +- .../cpu/activation/activation_op_test.cc | 48 ++++++-- .../cpu/activation/activation_op_test.h | 7 +- 27 files changed, 395 insertions(+), 197 deletions(-) delete mode 100644 onnxruntime/contrib_ops/rocm/bert/fast_gelu.cc delete mode 100644 onnxruntime/contrib_ops/rocm/bert/fast_gelu.h create mode 100644 onnxruntime/core/providers/cpu/tensor/gelu.cc create mode 100644 onnxruntime/core/providers/cpu/tensor/gelu.h create mode 100644 onnxruntime/core/providers/cuda/tensor/gelu.cc create mode 100644 onnxruntime/core/providers/cuda/tensor/gelu.h rename onnxruntime/{contrib_ops/cuda/bert/fast_gelu_impl.cu => core/providers/cuda/tensor/gelu_approximate_impl.cu} (88%) create mode 100644 onnxruntime/core/providers/cuda/tensor/gelu_impl.cu rename onnxruntime/{contrib_ops/cuda/bert/fast_gelu_impl.h => core/providers/cuda/tensor/gelu_impl.h} (80%) diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake index 85a9bf50460d3..1bb70e9c2ed27 100644 --- a/cmake/onnxruntime_rocm_hipify.cmake +++ b/cmake/onnxruntime_rocm_hipify.cmake @@ -20,10 +20,6 @@ set(contrib_ops_excluded_files "bert/fastertransformer_decoder_attention/*" "bert/multihead_attention.cc" "bert/multihead_attention.h" - "bert/fast_gelu_impl.cu" - "bert/fast_gelu_impl.h" - "bert/fast_gelu.cc" - "bert/fast_gelu.h" "bert/relative_attn_bias.cc" "bert/relative_attn_bias.h" "bert/relative_attn_bias_impl.cu" diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs index 715aed7e1d64f..7f3d5d6624b07 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs @@ -145,7 +145,7 @@ private void TestCUDAProviderOptions() private void CanRunInferenceOnAModelWithTensorRT() { string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "squeezenet.onnx"); - + int deviceId = 0; string deviceIdStr = System.Environment.GetEnvironmentVariable("ONNXRUNTIME_TEST_GPU_DEVICE_ID"); if (!string.IsNullOrEmpty(deviceIdStr) && int.TryParse(deviceIdStr, out int parsedValue) && parsedValue >= 0) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 
8ff2135c6b1f6..46149c577a106 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -127,6 +127,7 @@ Do not modify directly.*
|GatherND|*in* data:**T**<br/> *in* indices:**tensor(int64)**<br/> *out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br/> **indices** = tensor(int64)|
|||12|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br/> **indices** = tensor(int64)|
|||11|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br/> **indices** = tensor(int64)|
+|Gelu|*in* X:**T**<br/> *out* Y:**T**|20+|**T** = tensor(float)|
|Gemm|*in* A:**T**<br/> *in* B:**T**<br/> *in* C:**T**<br/> *out* Y:**T**|13+|**T** = tensor(double), tensor(float)|
|||[11, 12]|**T** = tensor(double), tensor(float)|
|||[9, 10]|**T** = tensor(double), tensor(float)|
@@ -606,6 +607,7 @@ Do not modify directly.*
|GatherND|*in* data:**T**<br/> *in* indices:**tensor(int64)**<br/> *out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int64)<br/> **indices** = tensor(int64)|
|||12|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int64)<br/> **indices** = tensor(int64)|
|||11|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int64)<br/> **indices** = tensor(int64)|
+|Gelu|*in* X:**T**<br/> *out* Y:**T**|20+|**T** = tensor(double), tensor(float), tensor(float16)|
|Gemm|*in* A:**T**<br/> *in* B:**T**<br/> *in* C:**T**<br/>
*out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)| |||[11, 12]|**T** = tensor(double), tensor(float), tensor(float16)| |||[9, 10]|**T** = tensor(double), tensor(float), tensor(float16)| diff --git a/include/onnxruntime/core/providers/cuda/cuda_resource.h b/include/onnxruntime/core/providers/cuda/cuda_resource.h index 1fef077860be3..00e7dec5727d1 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_resource.h +++ b/include/onnxruntime/core/providers/cuda/cuda_resource.h @@ -19,4 +19,4 @@ enum CudaResource : int { enable_skip_layer_norm_strict_mode_t, prefer_nhwc_t, use_tf32_t, -}; \ No newline at end of file +}; diff --git a/onnxruntime/contrib_ops/cpu/activations.cc b/onnxruntime/contrib_ops/cpu/activations.cc index 556699192d2eb..3e0533dd8b9e5 100644 --- a/onnxruntime/contrib_ops/cpu/activations.cc +++ b/onnxruntime/contrib_ops/cpu/activations.cc @@ -2,7 +2,7 @@ // Licensed under the MIT License. #include "core/providers/cpu/activation/activations.h" -#include "activations.h" +#include "contrib_ops/cpu/activations.h" namespace onnxruntime { namespace contrib { @@ -26,14 +26,6 @@ ONNX_CPU_OPERATOR_VERSIONED_KERNEL( KernelDefBuilder().MayInplace(0, 0).TypeConstraint("T", DataTypeImpl::GetTensorType()), ThresholdedRelu); -ONNX_OPERATOR_KERNEL_EX( - Gelu, - kMSDomain, - 1, - kCpuExecutionProvider, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Gelu); - ONNX_OPERATOR_KERNEL_EX( QuickGelu, kMSDomain, diff --git a/onnxruntime/contrib_ops/cpu/activations.h b/onnxruntime/contrib_ops/cpu/activations.h index aed4c2229215d..7e64235d3fc3d 100644 --- a/onnxruntime/contrib_ops/cpu/activations.h +++ b/onnxruntime/contrib_ops/cpu/activations.h @@ -54,47 +54,6 @@ namespace contrib { DEFINE_ELE_KERNEL(ScaledTanh); DEFINE_ELE_KERNEL(ParametricSoftplus); -template -class Gelu : public OpKernel { - public: - Gelu(const OpKernelInfo& info) : OpKernel(info) { - } - - Status Compute(OpKernelContext* context) const override { - const Tensor* input = context->Input(0); - const T* input_data = input->Data(); - - Tensor* output = context->Output(0, input->Shape()); - T* output_data = output->MutableData(); - - concurrency::ThreadPool* tp = context->GetOperatorThreadPool(); - int64_t elem_count = input->Shape().Size(); - constexpr int64_t length_per_task = 4096; // this number comes from FastGelu. - int64_t task_count = (elem_count + length_per_task - 1) / length_per_task; - concurrency::ThreadPool::TryBatchParallelFor( - tp, static_cast(task_count), - [&](ptrdiff_t task_idx) { - const auto start = task_idx * length_per_task; - const T* p_input = input_data + start; - T* p_output = output_data + start; - int64_t count = std::min(length_per_task, elem_count - start); - - for (int64_t i = 0; i < count; i++) { - T value = p_input[i]; - p_output[i] = value * static_cast(M_SQRT1_2); - } - - MlasComputeErf(p_output, p_output, narrow(count)); - - for (int64_t i = 0; i < count; i++) { - p_output[i] = 0.5f * p_input[i] * (p_output[i] + 1.0f); - } - }, - 0); - return Status::OK(); - } -}; - // Implement a new one instead of inheriting from ElementWiseRangedTransform so that we can call // MlasComputeLogistic instead of using Eigen for better perf. 
template diff --git a/onnxruntime/contrib_ops/cuda/activation/activations.cc b/onnxruntime/contrib_ops/cuda/activation/activations.cc index 1a86c5dbece5a..6303858b9bd48 100644 --- a/onnxruntime/contrib_ops/cuda/activation/activations.cc +++ b/onnxruntime/contrib_ops/cuda/activation/activations.cc @@ -49,7 +49,6 @@ namespace cuda { UNARY_ACTIVATION_OP_HFD(Affine, 1, kOnnxDomain); UNARY_ACTIVATION_OP_HFD(ParametricSoftplus, 1, kOnnxDomain); UNARY_ACTIVATION_OP_HFD(ScaledTanh, 1, kOnnxDomain); -UNARY_ACTIVATION_OP_HFD(Gelu, 1, kMSDomain); UNARY_ACTIVATION_OP_HFD(QuickGelu, 1, kMSDomain); REGISTER_ACTIVATION_KERNEL(ThresholdedRelu, 1, kOnnxDomain, MLFloat16) diff --git a/onnxruntime/contrib_ops/cuda/activation/activations.h b/onnxruntime/contrib_ops/cuda/activation/activations.h index ab339f276c2bd..fc9a71b0b7fa1 100644 --- a/onnxruntime/contrib_ops/cuda/activation/activations.h +++ b/onnxruntime/contrib_ops/cuda/activation/activations.h @@ -66,17 +66,6 @@ class ScaledTanh final : public UnaryElementwise { float beta_; }; -template -class Gelu final : public UnaryElementwise { - public: - Gelu(const OpKernelInfo& info) : UnaryElementwise(info) {} - - Status ComputeInternal(OpKernelContext* context) const override; - - private: - MAKE_FUNC_CTX_NULL() -}; - template class QuickGelu final : public UnaryElementwise { public: diff --git a/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu b/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu index 0c856815fd437..36f33fbb24c18 100644 --- a/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu +++ b/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu @@ -36,20 +36,6 @@ struct OP_ScaledTanh : public CtxScaledTanh { } }; -template -struct OP_Gelu : public CtxGelu { - __device__ __inline__ T operator()(const T& a) const { - return _Gelu(a); - } -}; - -template <> -struct OP_Gelu : public CtxGelu { - __device__ __inline__ half operator()(const half& a) const { - return static_cast(_Gelu(static_cast(a))); - } -}; - template struct OP_QuickGelu : public CtxQuickGelu { __device__ __inline__ T operator()(const T& a) const { diff --git a/onnxruntime/contrib_ops/cuda/activation/activations_impl.h b/onnxruntime/contrib_ops/cuda/activation/activations_impl.h index 5d18283a395e3..782d4bf59a5ad 100644 --- a/onnxruntime/contrib_ops/cuda/activation/activations_impl.h +++ b/onnxruntime/contrib_ops/cuda/activation/activations_impl.h @@ -11,14 +11,12 @@ namespace cuda { typedef onnxruntime::cuda::CtxAlphaBeta CtxAffine; typedef onnxruntime::cuda::CtxAlphaBeta CtxParametricSoftplus; typedef onnxruntime::cuda::CtxAlphaBeta CtxScaledTanh; -typedef onnxruntime::cuda::CtxNull CtxGelu; typedef onnxruntime::cuda::CtxAlpha CtxQuickGelu; #define UNARY_CONTRIB_ACTIVATION_OPS() \ UNARY_ACTIVATION_OP_NAME(ScaledTanh) \ UNARY_ACTIVATION_OP_NAME(Affine) \ UNARY_ACTIVATION_OP_NAME(ParametricSoftplus) \ - UNARY_ACTIVATION_OP_NAME(Gelu) \ UNARY_ACTIVATION_OP_NAME(QuickGelu) #define UNARY_ACTIVATION_OP_NAME(name) UNARY_ACTIVATION_IMPL_DECLARATION(name); diff --git a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc index 892f5c181a607..e8974a29476b6 100644 --- a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc +++ b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc @@ -4,9 +4,14 @@ #include "core/providers/cuda/cuda_common.h" #include "core/providers/cuda/cudnn_common.h" #include "fast_gelu.h" -#include "fast_gelu_impl.h" +#include "core/providers/cuda/tensor/gelu_impl.h" #include 
"contrib_ops/cpu/bert/bias_gelu_helper.h" -#include "transformer_common.h" +#ifdef USE_ROCM +#include "contrib_ops/rocm/bert/elementwise.h" +#endif +#ifdef USE_CUDA +#include "contrib_ops/cuda/bert/transformer_common.h" +#endif namespace onnxruntime { namespace contrib { @@ -31,8 +36,10 @@ using namespace ONNX_NAMESPACE; template FastGelu::FastGelu(const OpKernelInfo& op_kernel_info) : CudaKernel(op_kernel_info) { +#ifdef USE_CUDA const TransformerOptions* options = TransformerOptions::GetInstance(); use_half2_ = !options->DisableHalf2(); +#endif } template @@ -50,6 +57,14 @@ Status FastGelu::ComputeInternal(OpKernelContext* context) const { int64_t bias_length = (nullptr == bias) ? 0 : bias->Shape().Size(); typedef typename ToCudaType::MappedType CudaT; +#ifdef USE_ROCM + return LaunchElementwiseKernel( + GetTuningContext(), context->GetComputeStream(), + reinterpret_cast(input->Data()), static_cast(input_length), + (nullptr != bias) ? reinterpret_cast(bias->Data()) : nullptr, static_cast(bias_length), + reinterpret_cast(output->MutableData())); +#endif +#ifdef USE_CUDA return LaunchFastGeluKernel(GetDeviceProp(), Stream(context), static_cast(input_length), @@ -58,6 +73,7 @@ Status FastGelu::ComputeInternal(OpKernelContext* context) const { (nullptr != bias) ? reinterpret_cast(bias->Data()) : nullptr, reinterpret_cast(output->MutableData()), use_half2_); +#endif } } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h index 3e642a70afef5..d563556593e6e 100644 --- a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h +++ b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h @@ -18,7 +18,7 @@ class FastGelu final : public CudaKernel { Status ComputeInternal(OpKernelContext* ctx) const override; private: - bool use_half2_; + bool use_half2_; // Only applicable to CUDA kernel (not ROCM). }; } // namespace cuda diff --git a/onnxruntime/contrib_ops/rocm/bert/fast_gelu.cc b/onnxruntime/contrib_ops/rocm/bert/fast_gelu.cc deleted file mode 100644 index 9cb414e4e8980..0000000000000 --- a/onnxruntime/contrib_ops/rocm/bert/fast_gelu.cc +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "contrib_ops/rocm/bert/fast_gelu.h" - -#include "core/providers/rocm/rocm_common.h" -#include "core/providers/rocm/miopen_common.h" -#include "contrib_ops/cpu/bert/bias_gelu_helper.h" -#include "contrib_ops/rocm/bert/elementwise.h" -#include "contrib_ops/rocm/bert/transformer_common.h" - -namespace onnxruntime { -namespace contrib { -namespace rocm { - -#define REGISTER_KERNEL_TYPED(T) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - FastGelu, \ - kMSDomain, \ - 1, \ - T, \ - kRocmExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - FastGelu); - -REGISTER_KERNEL_TYPED(float) -REGISTER_KERNEL_TYPED(MLFloat16) -REGISTER_KERNEL_TYPED(BFloat16) - -using namespace ONNX_NAMESPACE; - -template -Status FastGelu::ComputeInternal(OpKernelContext* context) const { - ORT_RETURN_IF_ERROR(bias_gelu_helper::CheckInputs(context)); - - const Tensor* input = context->Input(0); - const Tensor* bias = context->Input(1); - Tensor* output = context->Output(0, input->Shape()); - - int64_t input_length = input->Shape().Size(); - if (input_length == 0) { - return Status::OK(); - } - int64_t bias_length = (nullptr == bias) ? 
0 : bias->Shape().Size(); - typedef typename ToHipType::MappedType HipT; - - const HipT* input_buffer = reinterpret_cast(input->Data()); - const HipT* bias_buffer = (nullptr != bias) ? reinterpret_cast(bias->Data()) : nullptr; - return LaunchElementwiseKernel( - GetTuningContext(), context->GetComputeStream(), - input_buffer, static_cast(input_length), - bias_buffer, static_cast(bias_length), - reinterpret_cast(output->MutableData())); -} - -} // namespace rocm -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/rocm/bert/fast_gelu.h b/onnxruntime/contrib_ops/rocm/bert/fast_gelu.h deleted file mode 100644 index 42bfe5a0b0246..0000000000000 --- a/onnxruntime/contrib_ops/rocm/bert/fast_gelu.h +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include "core/common/common.h" -#include "core/providers/rocm/rocm_kernel.h" - -namespace onnxruntime { -namespace contrib { -namespace rocm { - -using namespace onnxruntime::rocm; - -template -class FastGelu final : public RocmKernel { - public: - FastGelu(const OpKernelInfo& op_kernel_info) : RocmKernel(op_kernel_info) {} - Status ComputeInternal(OpKernelContext* ctx) const override; -}; - -} // namespace rocm -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc index 813fdc54ecd0d..48e4617b33b4d 100644 --- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc +++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc @@ -1035,6 +1035,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, float, IsNaN); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, double, IsNaN); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, MLFloat16, IsNaN); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Gelu); #if !defined(DISABLE_FLOAT8_TYPES) class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Float8E4M3FN, IsNaN); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Float8E4M3FNUZ, IsNaN); @@ -2562,6 +2563,7 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cpu/tensor/gelu.cc b/onnxruntime/core/providers/cpu/tensor/gelu.cc new file mode 100644 index 0000000000000..d55973eda180f --- /dev/null +++ b/onnxruntime/core/providers/cpu/tensor/gelu.cc @@ -0,0 +1,108 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/common/common.h" +#include "core/common/narrow.h" +#include "core/framework/op_kernel.h" +#include "core/util/math_cpuonly.h" +#include "core/mlas/inc/mlas.h" + +#include "core/platform/threadpool.h" +#include +#include "core/providers/cpu/element_wise_ranged_transform.h" +#include "core/providers/cpu/tensor/gelu.h" + +using onnxruntime::narrow; +using namespace onnxruntime::common; + +namespace onnxruntime { + +// May revisit the implementations to support inplace computation, if needed. 
+ +ONNX_CPU_OPERATOR_KERNEL( + Gelu, + 20, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + Gelu); + +#ifndef DISABLE_CONTRIB_OPS +namespace contrib { +ONNX_OPERATOR_KERNEL_EX( + Gelu, + kMSDomain, + 1, + kCpuExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + Gelu); +} +#endif + +template +Status Gelu::Compute(OpKernelContext* context) const { + const Tensor* input = context->Input(0); + const T* input_data = input->Data(); + + Tensor* output = context->Output(0, input->Shape()); + T* output_data = output->MutableData(); + + concurrency::ThreadPool* tp = context->GetOperatorThreadPool(); + int64_t elem_count = input->Shape().Size(); + constexpr int64_t length_per_task = 4096; // this number comes from FastGelu. + int64_t task_count = (elem_count + length_per_task - 1) / length_per_task; + + if (approximation_algorithm_ == "tanh") { + // FastGelu allows optional bias. Here we split input data into chunks. Each chunk + // has N elements (except the last chunk), and use thread pool to parallel chunks. + // N = 4096 is selected based on performance test results on input shape 1x128x768. + // FastGelu uses approximation for Gelu. The formula is 0.5 * (1 + Tanh(x * (C * x * x + B))) * x. + static constexpr float B = 0.7978845608028654f; // sqrt(2.0 / M_PI) + static constexpr float C = 0.035677408136300125f; // 0.044715 * sqrt(2.0 / M_PI) + + concurrency::ThreadPool::TryBatchParallelFor( + tp, static_cast(task_count), + [&](ptrdiff_t task_idx) { + const auto start = task_idx * length_per_task; + const T* p_input = input_data + start; + T* p_output = output_data + start; + int64_t count = std::min(length_per_task, elem_count - start); + + for (int64_t i = 0; i < count; i++) { + T value = p_input[i]; + p_output[i] = value * (static_cast(C) * value * value + static_cast(B)); + } + + MlasComputeTanh(p_output, p_output, narrow(count)); + + for (int64_t i = 0; i < count; i++) { + p_output[i] = 0.5f * p_input[i] * (p_output[i] + 1.0f); + } + }, + 0); + return Status::OK(); + } else if (approximation_algorithm_ == "none") { + concurrency::ThreadPool::TryBatchParallelFor( + tp, static_cast(task_count), + [&](ptrdiff_t task_idx) { + const auto start = task_idx * length_per_task; + const T* p_input = input_data + start; + T* p_output = output_data + start; + int64_t count = std::min(length_per_task, elem_count - start); + + for (int64_t i = 0; i < count; i++) { + T value = p_input[i]; + p_output[i] = value * static_cast(M_SQRT1_2); + } + + MlasComputeErf(p_output, p_output, narrow(count)); + + for (int64_t i = 0; i < count; i++) { + p_output[i] = 0.5f * p_input[i] * (p_output[i] + 1.0f); + } + }, + 0); + return Status::OK(); + } + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported approximation_algorithm: ", approximation_algorithm_); +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/tensor/gelu.h b/onnxruntime/core/providers/cpu/tensor/gelu.h new file mode 100644 index 0000000000000..13238028d878a --- /dev/null +++ b/onnxruntime/core/providers/cpu/tensor/gelu.h @@ -0,0 +1,18 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +namespace onnxruntime { + +template +class Gelu final : public OpKernel { + public: + explicit Gelu(const OpKernelInfo& info) : OpKernel(info) { + approximation_algorithm_ = info.GetAttrOrDefault("approximate", "none"); + } + Status Compute(OpKernelContext* ctx) const override; + + private: + std::string approximation_algorithm_; +}; + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 0dd568c5ecc05..be2530aec49fa 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -1329,6 +1329,11 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, S class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Shape); #endif +// Opset 20 +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, float, Gelu); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, double, Gelu); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, MLFloat16, Gelu); + template <> KernelCreateInfo BuildKernelCreateInfo() { return {}; @@ -2222,6 +2227,11 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + + // Opset 20 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif }; diff --git a/onnxruntime/core/providers/cuda/tensor/gelu.cc b/onnxruntime/core/providers/cuda/tensor/gelu.cc new file mode 100644 index 0000000000000..67b2fad373a7f --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/gelu.cc @@ -0,0 +1,89 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/providers/cuda/cuda_common.h" +#include "core/providers/cuda/cudnn_common.h" +#include "core/providers/cuda/tensor/gelu.h" +#include "core/providers/cuda/tensor/gelu_impl.h" + +namespace onnxruntime { +namespace cuda { + +#define REGISTER_KERNEL_TYPED(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + Gelu, \ + kOnnxDomain, \ + 20, \ + T, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ + .MayInplace(0, 0), \ + Gelu); + +REGISTER_KERNEL_TYPED(float) +REGISTER_KERNEL_TYPED(MLFloat16) +REGISTER_KERNEL_TYPED(double) + +template +Status Gelu::ComputeInternal(OpKernelContext* context) const { + const Tensor* input = context->Input(0); + const auto& input_dims = input->Shape().GetDims(); + if (input_dims.size() < 1) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 0 is expected to have 1 or more dimensions, got ", input_dims.size()); + } + + Tensor* output = context->Output(0, input->Shape()); + + int64_t input_length = input->Shape().Size(); + if (input_length == 0) { + return Status::OK(); + } + + typedef typename ToCudaType::MappedType CudaT; + + if (approximation_algorithm_ == "tanh") { + return LaunchFastGeluKernel(GetDeviceProp(), + Stream(context), + static_cast(input_length), + 0 /* no bias */, + reinterpret_cast(input->Data()), + nullptr /* no bias */, + reinterpret_cast(output->MutableData()), + use_half2_); + } else if (approximation_algorithm_ == "none") { + return LaunchGeluKernel(Stream(context), + reinterpret_cast(input->Data()), + reinterpret_cast(output->MutableData()), + static_cast(input_length)); + } + + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported approximation_algorithm: ", approximation_algorithm_); +} + +} // namespace cuda + +#ifndef DISABLE_CONTRIB_OPS +namespace contrib::cuda { +#define REGISTER_CONTRIB_KERNEL_TYPED(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + Gelu, \ + kMSDomain, \ + 1, \ + T, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ + .MayInplace(0, 0), \ + onnxruntime::cuda::Gelu); + +REGISTER_CONTRIB_KERNEL_TYPED(float) +REGISTER_CONTRIB_KERNEL_TYPED(MLFloat16) +REGISTER_CONTRIB_KERNEL_TYPED(double) + +#undef REGISTER_CONTRIB_KERNEL_TYPED +} // namespace contrib::cuda +#endif + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/gelu.h b/onnxruntime/core/providers/cuda/tensor/gelu.h new file mode 100644 index 0000000000000..1c8189ab24121 --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/gelu.h @@ -0,0 +1,28 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once +#include "core/common/common.h" +#include "core/providers/cuda/cuda_kernel.h" +#include "core/providers/cuda/math/unary_elementwise_ops.h" + +namespace onnxruntime { +namespace cuda { + +template +class Gelu final : public UnaryElementwise { + public: + Gelu(const OpKernelInfo& info) : UnaryElementwise(info) { + approximation_algorithm_ = info.GetAttrOrDefault("approximate", "none"); + } + + Status ComputeInternal(OpKernelContext* ctx) const override; + + private: + const bool use_half2_{true}; + + std::string approximation_algorithm_; +}; + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/fast_gelu_impl.cu b/onnxruntime/core/providers/cuda/tensor/gelu_approximate_impl.cu similarity index 88% rename from onnxruntime/contrib_ops/cuda/bert/fast_gelu_impl.cu rename to onnxruntime/core/providers/cuda/tensor/gelu_approximate_impl.cu index c9498eb1bcd7b..3292650584de8 100644 --- a/onnxruntime/contrib_ops/cuda/bert/fast_gelu_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/gelu_approximate_impl.cu @@ -24,12 +24,9 @@ limitations under the License. #include "core/providers/cuda/cuda_common.h" #include "core/providers/cuda/cu_inc/common.cuh" #include "core/providers/cuda/shared_inc/cuda_call.h" -#include "contrib_ops/cuda/bert/fast_gelu_impl.h" - -using namespace onnxruntime::cuda; +#include "core/providers/cuda/tensor/gelu_impl.h" namespace onnxruntime { -namespace contrib { namespace cuda { // constants for approximating the normal cdf @@ -75,6 +72,17 @@ Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int return CUDA_CALL(cudaGetLastError()); } +template <> +Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int input_length, int bias_length, + const double* input, const double* bias, double* output, bool /*use_half2*/) { + constexpr int blockSize = 256; + const int gridSize = (input_length + blockSize - 1) / blockSize; + FastGeluKernel<<>>(A, B, C, input_length, bias_length, + input, bias, output); + + return CUDA_CALL(cudaGetLastError()); +} + template <> Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int input_length, int bias_length, const half* input, const half* bias, half* output, bool use_half2) { @@ -114,5 +122,4 @@ Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int } } // namespace cuda -} // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/gelu_impl.cu b/onnxruntime/core/providers/cuda/tensor/gelu_impl.cu new file mode 100644 index 0000000000000..3f96da38b37bb --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/gelu_impl.cu @@ -0,0 +1,48 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include +#include "core/providers/cuda/tensor/gelu_impl.h" +#include "core/providers/cuda/cu_inc/common.cuh" +#include "core/providers/cuda/cu_inc/unary_elementwise_impl.cuh" + +namespace onnxruntime { +namespace cuda { + +template +struct OP_Gelu { + __device__ __inline__ T operator()(const T& a) const { + return _Gelu(a); + } +}; + +template <> +struct OP_Gelu { + __device__ __inline__ half operator()(const half& a) const { + return static_cast(_Gelu(static_cast(a))); + } +}; + +template +Status LaunchGeluKernel( + cudaStream_t stream, + const T* input_data, + T* output_data, + size_t count) { + UnaryElementWiseImpl(stream, input_data, output_data, OP_Gelu(), count); + + return CUDA_CALL(cudaGetLastError()); +} + +#define SPECIALIZED_GELU_IMPL(T) \ + template Status LaunchGeluKernel(cudaStream_t stream, const T* input_data, T* output_data, \ + size_t count); + +SPECIALIZED_GELU_IMPL(float); +SPECIALIZED_GELU_IMPL(half); +SPECIALIZED_GELU_IMPL(double); + +#undef SPECIALIZED_GELU_IMPL + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/fast_gelu_impl.h b/onnxruntime/core/providers/cuda/tensor/gelu_impl.h similarity index 80% rename from onnxruntime/contrib_ops/cuda/bert/fast_gelu_impl.h rename to onnxruntime/core/providers/cuda/tensor/gelu_impl.h index ba78310f5dfc2..2ea0d3441fda3 100644 --- a/onnxruntime/contrib_ops/cuda/bert/fast_gelu_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/gelu_impl.h @@ -1,17 +1,18 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. - #pragma once + #include "core/common/common.h" namespace onnxruntime { -namespace contrib { namespace cuda { +template +Status LaunchGeluKernel(cudaStream_t stream, const T* input, T* output, size_t count); + template Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int input_length, int bias_length, const T* input, const T* bias, T* output, bool use_half2); } // namespace cuda -} // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/activation_op_test.cc b/onnxruntime/test/contrib_ops/activation_op_test.cc index b1e54ec605a39..2a56991ec5af4 100644 --- a/onnxruntime/test/contrib_ops/activation_op_test.cc +++ b/onnxruntime/test/contrib_ops/activation_op_test.cc @@ -22,7 +22,8 @@ namespace test { TEST_F(ActivationOpTest, ThresholdedRelu_version_1_to_9) { float alpha = 0.1f; TestActivationOp( - "ThresholdedRelu", input_values, [alpha](float x) { return (x >= alpha) ? x : 0; }, {{"alpha", alpha}}, true, 1); + "ThresholdedRelu", input_values, [alpha](float x) { return (x >= alpha) ? x : 0; }, {{"alpha", alpha}}, {}, + true, 1); } TEST_F(ActivationOpTest, ScaledTanh) { @@ -46,13 +47,13 @@ TEST_F(ActivationOpTest, ParametricSoftplus) { else return alpha * logf(expf(bx) + 1); }, - {{"alpha", alpha}, {"beta", beta}}, false); // Disable TensorRT due to result mismatch + {{"alpha", alpha}, {"beta", beta}}, {}, false); // Disable TensorRT due to result mismatch } TEST_F(ActivationOpTest, Gelu) { TestActivationOp( "Gelu", input_values, [](float x) { return x * 0.5f * (1.0f + std::erf(x * static_cast(M_SQRT1_2))); }, {}, - false, 1, kMSDomain); + {}, false, 1, kMSDomain); } #if defined(USE_DNNL) @@ -115,7 +116,7 @@ TEST_F(ActivationOpTest, QuickGelu) { y = tmp >= 0 ? y : 1 - y; return x * y; }, - {{"alpha", alpha}}, false, 1, kMSDomain); + {{"alpha", alpha}}, {}, false, 1, kMSDomain); } // Silu = x*sigmoid(x), i.e., alpha = 1.0f. 
@@ -129,7 +130,7 @@ TEST_F(ActivationOpTest, QuickGelu) { y = tmp >= 0 ? y : 1 - y; return x * y; }, - {{"alpha", alpha}}, false, 1, kMSDomain); + {{"alpha", alpha}}, {}, false, 1, kMSDomain); } // Negative alpha. @@ -143,7 +144,7 @@ TEST_F(ActivationOpTest, QuickGelu) { y = tmp >= 0 ? y : 1 - y; return x * y; }, - {{"alpha", alpha}}, false, 1, kMSDomain); + {{"alpha", alpha}}, {}, false, 1, kMSDomain); } } diff --git a/onnxruntime/test/onnx/microbenchmark/activation.cc b/onnxruntime/test/onnx/microbenchmark/activation.cc index cf859facf4765..69ee72996365e 100644 --- a/onnxruntime/test/onnx/microbenchmark/activation.cc +++ b/onnxruntime/test/onnx/microbenchmark/activation.cc @@ -11,6 +11,7 @@ #include "core/framework/node_index_info.h" #include "core/framework/execution_frame.h" #include "contrib_ops/cpu/activations.h" +#include "core/providers/cpu/tensor/gelu.h" #include "core/providers/cpu/activation/activations.h" #include #include @@ -182,7 +183,7 @@ static void RunSingleNode(const std::string& op_name, const std::string& domain, } static void BM_GeluCompute(benchmark::State& state) { - RunSingleNode>("Gelu", kMSDomain, {}, state); + RunSingleNode>("Gelu", kMSDomain, {}, state); } BENCHMARK(BM_GeluCompute) diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc index ddb0a6620619c..acd513172f95d 100644 --- a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc +++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc @@ -116,13 +116,13 @@ TEST_F(ActivationOpTest, Relu) { "Relu", input_values_double, [](double x) { return std::max(x, 0.0); }, - {}, + {}, {}, /*is_tensorrt_supported=*/false); TestActivationOp( "Relu", input_values_int8, [](int8_t x) { return std::max(x, static_cast(0)); }, - {}, + {}, {}, /*is_tensorrt_supported=*/false, /*opset_version= */ 14); #ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED @@ -133,7 +133,7 @@ TEST_F(ActivationOpTest, Relu) { if (x.ToFloat() > 0.0f) return x; return MLFloat16(); }, - {}, + {}, {}, /*is_tensorrt_supported=*/false, /*opset_version= */ 11); #endif // MLAS_F16VEC_INTRINSICS_SUPPORTED @@ -402,7 +402,7 @@ TEST_F(ActivationOpTest, Celu) { // TODO: Investigate why gcc 4 fails to compile without the explicit cast [alpha](float x) { return std::max(0.0f, x) + std::min(0.0f, alpha * (static_cast(exp(x / alpha)) - 1)); }, // Disable on TensorRT as it seems like it doesn't yet support Celu - {{"alpha", alpha}}, false, 12); + {{"alpha", alpha}}, {}, false, 12); } TEST_F(ActivationOpTest, LeakyRelu) { @@ -410,7 +410,7 @@ TEST_F(ActivationOpTest, LeakyRelu) { TestActivationOp("LeakyRelu", input_values, [alpha](float x) { return (x >= 0) ? x : alpha * x; }, - {{"alpha", alpha}}); + {{"alpha", alpha}}, {}); } #ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED @@ -442,7 +442,7 @@ TEST_F(ActivationOpTest, ThresholdedRelu) { "ThresholdedRelu", input_values, [alpha](float x) { return (x >= alpha) ? x : 0; }, - {{"alpha", alpha}}, true, 10); + {{"alpha", alpha}}, {}, true, 10); } TEST_F(ActivationOpTest, Selu) { @@ -452,7 +452,7 @@ TEST_F(ActivationOpTest, Selu) { TestActivationOp("Selu", input_values, [](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; }, - {{"alpha", alpha}, {"gamma", gamma}}); + {{"alpha", alpha}, {"gamma", gamma}}, {}); } TEST_F(ActivationOpTest, Selu_Attributes) { @@ -462,7 +462,7 @@ TEST_F(ActivationOpTest, Selu_Attributes) { TestActivationOp("Selu", input_values, [](float x) { return x <= 0 ? 
gamma * (alpha * exp(x) - alpha) : gamma * x; }, - {{"alpha", alpha}, {"gamma", gamma}}); + {{"alpha", alpha}, {"gamma", gamma}}, {}); } TEST_F(ActivationOpTest, Selu_GH10726) { @@ -472,7 +472,7 @@ TEST_F(ActivationOpTest, Selu_GH10726) { TestActivationOp("Selu", {{1.f, -1.f}}, [](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; }, - {{"alpha", alpha}, {"gamma", gamma}}); + {{"alpha", alpha}, {"gamma", gamma}}, {}); } TEST_F(ActivationOpTest, PRelu) { @@ -625,7 +625,7 @@ TEST_F(ActivationOpNoInfTest, Softsign) { return result; }, - {}, false); // Disable TensorRT because result mismatches + {}, {}, false); // Disable TensorRT because result mismatches } #if defined(ENABLE_TRAINING_OPS) @@ -695,5 +695,33 @@ TEST(LeakyReluGradInferenceTest, Basic) { } #endif +// Remove DNNL from running this test because DNNL Gelu op seems not check domain for kernel implementation. +// It will run the DNNL Gelu op which only be part of standard of Gelu-20 op. +#if !defined(USE_DNNL) && !defined(USE_QNN) +TEST_F(ActivationOpTest, ONNX_Gelu) { + TestActivationOp( + "Gelu", + input_values, + [](float x) { return 0.5 * x * (1 + erf(x * M_SQRT1_2)); }, {}, + {{"approximate", "none"}}, true, 20); + + TestActivationOp( + "Gelu", + input_values, + [](float x) { return 0.5 * x * (1 + erf(x * M_SQRT1_2)); }, + {}, + {/*default value of approximate attribute is none */}, true, 20); + + TestActivationOp( + "Gelu", + input_values, + [](float x) { + return 0.5 * x * (1 + tanh(sqrt(2 / M_PI) * (x + 0.044715 * x * x * x))); + }, + {}, + {{"approximate", "tanh"}}, true, 20); +} +#endif + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.h b/onnxruntime/test/providers/cpu/activation/activation_op_test.h index b5ec1402584fb..984b8f4437a3b 100644 --- a/onnxruntime/test/providers/cpu/activation/activation_op_test.h +++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.h @@ -17,13 +17,16 @@ namespace test { template inline void TestActivationOp(const char* szOp, const std::vector>& input_vals_vec, std::function expected_func, - const std::unordered_map attribs = {}, + const std::unordered_map float_attribs = {}, + const std::unordered_map string_attribs = {}, bool is_tensorrt_supported = true, int opset_version = 7, const char* domain = kOnnxDomain) { for (const std::vector& input_vals : input_vals_vec) { OpTester test(szOp, opset_version, domain); - for (auto attr : attribs) test.AddAttribute(attr.first, attr.second); + for (auto attr : float_attribs) test.AddAttribute(attr.first, attr.second); + for (auto attr : string_attribs) test.AddAttribute(attr.first, attr.second); + std::vector dims{(int64_t)input_vals.size()}; std::vector expected_vals; From 5e432a3ae69dbbed603420493c52ba48b3726471 Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Fri, 23 Feb 2024 04:47:15 +0100 Subject: [PATCH 048/279] Add support for NHWC GridSample in the CUDA EP and enable grid_sample_test for all EPs (#19562) I've added NHWC GridSample support to the CUDA EP to reduce the number of layout transforms. Also I've enabled the full set of GridSampleTests for all EPs. I've also added the GridSample OpSet 16 to the registered kernels. ### Motivation and Context This is the first PR is a series of enhancements of the CUDA EP improving NHWC support to avoid costly layout transforms between NWHC and NCHW nodes which are layout sensitive. Also testing was quite rudimentary for the CUDA EP while it was great for the CPU path. 
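In practice the NHWC support comes down to changing how an (n, c, y, x) element is addressed; the templated `Channels<>` helper and the `PixelOffset` lambda in the diff below encode exactly this. A minimal standalone sketch of the two offset formulas (illustration only, not the PR's kernel code; the helper names are invented for the example):

```
#include <cstdint>
#include <cstdio>

// Offset of element (n, c, y, x) in a densely packed NCHW tensor of shape [N, C, H, W].
constexpr int64_t OffsetNCHW(int64_t n, int64_t c, int64_t y, int64_t x,
                             int64_t C, int64_t H, int64_t W) {
  return n * C * H * W + c * H * W + y * W + x;
}

// Offset of the same logical element when the tensor is stored as NHWC.
constexpr int64_t OffsetNHWC(int64_t n, int64_t c, int64_t y, int64_t x,
                             int64_t C, int64_t H, int64_t W) {
  return n * H * W * C + y * W * C + x * C + c;
}

int main() {
  const int64_t C = 3, H = 4, W = 5;
  // Same logical element, two different physical locations.
  std::printf("NCHW offset: %lld\n", static_cast<long long>(OffsetNCHW(0, 1, 2, 3, C, H, W)));
  std::printf("NHWC offset: %lld\n", static_cast<long long>(OffsetNHWC(0, 1, 2, 3, C, H, W)));
  return 0;
}
```

Keeping both layouts behind a single boolean template parameter is what lets the kernel run directly on NHWC tensors instead of forcing the layout transformer to insert Transpose nodes around it.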
I've regenerated grid_sample_test.cc enabling tests for other platforms as well. Those tests resurfaced #10607 again which is fixed as well. --- docs/OperatorKernels.md | 1 + .../contrib_ops/cuda/cuda_contrib_kernels.cc | 7 + onnxruntime/contrib_ops/cuda/grid_sample.cc | 35 ++-- onnxruntime/contrib_ops/cuda/grid_sample.h | 2 +- .../contrib_ops/cuda/grid_sample_impl.cu | 101 ++++++---- .../contrib_ops/cuda/grid_sample_impl.h | 2 +- .../layout_transformation.cc | 2 + .../providers/cuda/cuda_execution_provider.cc | 2 + .../providers/cuda/shared_inc/cuda_utils.h | 26 +++ .../providers/cpu/tensor/grid_sample_test.cc | 172 ++++++++---------- .../cpu/tensor/grid_sample_test_gen.py | 2 +- onnxruntime/test/util/default_providers.cc | 16 ++ .../test/util/include/default_providers.h | 3 + 13 files changed, 223 insertions(+), 148 deletions(-) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 46149c577a106..b0ed68d595c42 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -619,6 +619,7 @@ Do not modify directly.* |||[7, 8]|**T** = tensor(double), tensor(float), tensor(float16)| |GreaterOrEqual|*in* A:**T**
*in* B:**T**<br> *out* C:**T1**|16+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)<br> **T1** = tensor(bool)|
|||[12, 15]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)<br> **T1** = tensor(bool)|
+|GridSample|*in* X:**T1**<br> *in* grid:**T2**<br> *out* Y:**T1**|16+|**T1** = tensor(float)<br> **T2** = tensor(float)|
|HardSigmoid|*in* X:**T**<br> *out* Y:**T**|6+|**T** = tensor(double), tensor(float), tensor(float16)|
|Identity|*in* input:**T**<br> *out* output:**T**<br><br> or<br><br> *in* input:**V**
*out* output:**V**|19+|**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[14, 18]|**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| diff --git a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc index be8c0dc86c135..57e951d3a68ff 100644 --- a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc @@ -203,6 +203,10 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, DistributedSqueeze); #endif +#ifdef ENABLE_CUDA_NHWC_OPS +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 16, float, GridSample); +#endif + template <> KernelCreateInfo BuildKernelCreateInfo() { KernelCreateInfo info; @@ -408,6 +412,9 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, #endif +#ifdef ENABLE_CUDA_NHWC_OPS + BuildKernelCreateInfo, +#endif }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/contrib_ops/cuda/grid_sample.cc b/onnxruntime/contrib_ops/cuda/grid_sample.cc index 4c2999c279e0a..2500de39d3536 100644 --- a/onnxruntime/contrib_ops/cuda/grid_sample.cc +++ b/onnxruntime/contrib_ops/cuda/grid_sample.cc @@ -9,22 +9,23 @@ namespace onnxruntime { namespace contrib { namespace cuda { -#define REGISTER_KERNEL_TYPED(T) \ +#define REGISTER_KERNEL_TYPED(T, VERSION, LAYOUT, DOMAIN) \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ GridSample, \ - kMSDomain, \ - 1, \ + DOMAIN, \ + VERSION, \ T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()) \ .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ - GridSample); + onnxruntime::contrib::cuda::GridSample); -REGISTER_KERNEL_TYPED(float) +REGISTER_KERNEL_TYPED(float, 1, LAYOUT_NCHW, kMSDomain) +REGISTER_KERNEL_TYPED(float, 16, LAYOUT_NHWC, kMSInternalNHWCDomain) -template -GridSample::GridSample(const OpKernelInfo& info) : CudaKernel(info) { +template +GridSample::GridSample(const OpKernelInfo& info) : CudaKernel(info) { std::string mode_str = info.GetAttrOrDefault("mode", "bilinear"); std::string padding_mode_str = info.GetAttrOrDefault("padding_mode", "zeros"); align_corners_ = static_cast(info.GetAttrOrDefault("align_corners", 0)); @@ -48,8 +49,8 @@ GridSample::GridSample(const OpKernelInfo& info) : CudaKernel(info) { } } -template 
-Status GridSample::ComputeInternal(OpKernelContext* context) const { +template +Status GridSample::ComputeInternal(OpKernelContext* context) const { const Tensor* X = context->Input(0); const auto& dims_input = X->Shape().GetDims(); const Tensor* Grid = context->Input(1); @@ -61,11 +62,13 @@ Status GridSample::ComputeInternal(OpKernelContext* context) const { ORT_ENFORCE(dims_grid[0] == dims_input[0], "Grid batch size ", dims_grid[0], " does not match input batch size ", dims_input[0]); ORT_ENFORCE(dims_grid[3] == 2, "Last dimension of grid: ", dims_grid[3], ", expect 2"); + using Ch = Channels; + TensorShapeVector dims_output(4); - dims_output[0] = dims_input[0]; - dims_output[1] = dims_input[1]; - dims_output[2] = dims_grid[1]; - dims_output[3] = dims_grid[2]; + dims_output[Ch::N] = dims_input[Ch::N]; + dims_output[Ch::C] = dims_input[Ch::C]; + dims_output[Ch::H] = dims_grid[1 /* Grid::H */]; + dims_output[Ch::W] = dims_grid[2 /* Grid::W */]; Tensor* Y = context->Output(0, dims_output); // Return early if the output tensor is going to be of size 0 if (Y->Shape().Size() == 0) { @@ -74,7 +77,7 @@ Status GridSample::ComputeInternal(OpKernelContext* context) const { typedef typename ToCudaType::MappedType CudaT; CudaT* Y_data = reinterpret_cast(Y->MutableData()); - GridSampleImpl( + GridSampleImpl( Stream(context), reinterpret_cast(X->Data()), reinterpret_cast(Grid->Data()), @@ -89,4 +92,8 @@ Status GridSample::ComputeInternal(OpKernelContext* context) const { } } // namespace cuda } // namespace contrib + +namespace cuda { +REGISTER_KERNEL_TYPED(float, 16, LAYOUT_NCHW, kOnnxDomain) +} // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/grid_sample.h b/onnxruntime/contrib_ops/cuda/grid_sample.h index 08ca58c7cc458..16581bfe77482 100644 --- a/onnxruntime/contrib_ops/cuda/grid_sample.h +++ b/onnxruntime/contrib_ops/cuda/grid_sample.h @@ -12,7 +12,7 @@ namespace cuda { using namespace onnxruntime::cuda; -template +template class GridSample final : public CudaKernel { public: explicit GridSample(const OpKernelInfo& info); diff --git a/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu b/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu index 8a391eca7e86a..b23da635bc83d 100644 --- a/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu +++ b/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu @@ -50,28 +50,34 @@ __device__ T GsReflect(T x, float x_min, float x_max) { return static_cast(fx); } -template +template __device__ T PixelAtGrid(const T* input_data, int64_t bIdx, int64_t cIdx, int64_t y, int64_t x, - int64_t padding_mode, int64_t N, int64_t C, int64_t H, int64_t W, float border[4]) { + int64_t padding_mode, int64_t N, int64_t C, int64_t H, int64_t W, float border[4]) { T pixel = 0.0f; + + auto PixelOffset = [bIdx, cIdx, C, H, W](int64_t x, int64_t y) -> int64_t { + return Layout == LAYOUT_NCHW + ? 
(bIdx * C * H * W + cIdx * H * W + y * W + x) + : (bIdx * H * W * C + y * W * C + x * C + cIdx); + }; + if (padding_mode == 0) { // zeros if (x >= 0 && x < W && y >= 0 && y < H) { - pixel = input_data[bIdx * C * H * W + cIdx * H * W + y * W + x]; + pixel = input_data[PixelOffset(x, y)]; } - } else if (padding_mode == 1) { //border + } else if (padding_mode == 1) { // border x = max((int64_t)0, min((int64_t)W - 1, (int64_t)x)); y = max((int64_t)0, min((int64_t)H - 1, (int64_t)y)); - pixel = input_data[bIdx * C * H * W + cIdx * H * W + y * W + x]; + pixel = input_data[PixelOffset(x, y)]; } else { // Reflection - x = (int64_t) GsReflect(x, border[0], border[2]); - y = (int64_t) GsReflect(y, border[1], border[3]); - pixel = input_data[bIdx * C * H * W + cIdx * H * W + y * W + x]; + x = (int64_t)GsReflect(x, border[0], border[2]); + y = (int64_t)GsReflect(y, border[1], border[3]); + pixel = input_data[PixelOffset(x, y)]; } return pixel; } -__device__ void GsGetCubicCoeffs(float x, float coeffs[4]) -{ +__device__ void GsGetCubicCoeffs(float x, float coeffs[4]) { float cubic_alpha = -0.75f; x = abs(x); coeffs[0] = (((cubic_alpha * (x + 1) - 5 * cubic_alpha) * (x + 1) + 8 * cubic_alpha) * (x + 1) - 4 * cubic_alpha); @@ -93,7 +99,7 @@ __device__ T GsBicubicInterpolate(T p[4][4], float x, float y) { return pixel; } -template +template __global__ void _GridSampleKernel( const T* input_data, const T* grid_data, @@ -110,16 +116,32 @@ __global__ void _GridSampleKernel( { CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(idx, N * C * H_out * W_out); // extract batch index, channel index, y index, x index for current thread - int BIdx = idx / (C * H_out * W_out ); - int tmpBCnt = BIdx * (C * H_out * W_out); + int BIdx, yIdx, xIdx, cIdx; + if constexpr (Layout == LAYOUT_NCHW) { + BIdx = idx / (C * H_out * W_out); + int tmpBCnt = BIdx * (C * H_out * W_out); + + cIdx = (idx - tmpBCnt) / (H_out * W_out); + int tmpCCnt = tmpBCnt + cIdx * (H_out * W_out); - int cIdx = (idx - tmpBCnt) / (H_out * W_out); - int tmpCCnt = tmpBCnt + cIdx * (H_out * W_out); + yIdx = (idx - tmpCCnt) / W_out; + int tmpHCnt = tmpCCnt + yIdx * W_out; - int yIdx = (idx - tmpCCnt) / W_out; - int tmpHCnt = tmpCCnt + yIdx * W_out; + xIdx = (idx - tmpHCnt); + } else { + static_assert(Layout == LAYOUT_NHWC, "Unsupported layout"); - int xIdx = (idx - tmpHCnt); + BIdx = idx / (H_out * W_out * C); + int tmpBCnt = BIdx * (H_out * W_out * C); + + yIdx = (idx - tmpBCnt) / (W_out * C); + int tmpHCnt = tmpBCnt + yIdx * (W_out * C); + + xIdx = (idx - tmpHCnt) / C; + int tmpWCnt = tmpHCnt + xIdx * C; + + cIdx = (idx - tmpWCnt); + } int grid_idx = BIdx * H_out * W_out + yIdx * W_out + xIdx; T grid_X = grid_data[grid_idx * 2 + 0]; @@ -147,8 +169,9 @@ __global__ void _GridSampleKernel( if (grid_x_imgSpace < x_min || grid_x_imgSpace > x_max || grid_y_imgSpace < y_min || grid_y_imgSpace > y_max) { // out of bound if (padding_mode == 1) { // border - grid_x_imgSpace = max(0.0f, min(grid_x_imgSpace, W_in - 1.0f)); - grid_y_imgSpace = max(0.0f, min(grid_y_imgSpace, H_in - 1.0f)); + // Clamping must not be done here, see #10607 + // grid_x_imgSpace = max(0.0f, min(grid_x_imgSpace, W_in - 1.0f)); + // grid_y_imgSpace = max(0.0f, min(grid_y_imgSpace, H_in - 1.0f)); } else if (padding_mode == 2) { // reflection grid_x_imgSpace = GsReflect(grid_x_imgSpace, x_min, x_max); grid_y_imgSpace = GsReflect(grid_y_imgSpace, y_min, y_max); @@ -175,10 +198,10 @@ __global__ void _GridSampleKernel( w_lb = w_b * w_l; w_rb = w_b * w_r; - T lt_v = PixelAtGrid(input_data, BIdx, cIdx, y1, x1, 
padding_mode, N, C, H_in, W_in, border); - T rt_v = PixelAtGrid(input_data, BIdx, cIdx, y1, x2, padding_mode, N, C, H_in, W_in, border); - T lb_v = PixelAtGrid(input_data, BIdx, cIdx, y2, x1, padding_mode, N, C, H_in, W_in, border); - T rb_v = PixelAtGrid(input_data, BIdx, cIdx, y2, x2, padding_mode, N, C, H_in, W_in, border); + T lt_v = PixelAtGrid(input_data, BIdx, cIdx, y1, x1, padding_mode, N, C, H_in, W_in, border); + T rt_v = PixelAtGrid(input_data, BIdx, cIdx, y1, x2, padding_mode, N, C, H_in, W_in, border); + T lb_v = PixelAtGrid(input_data, BIdx, cIdx, y2, x1, padding_mode, N, C, H_in, W_in, border); + T rb_v = PixelAtGrid(input_data, BIdx, cIdx, y2, x2, padding_mode, N, C, H_in, W_in, border); T interpoV = w_lt * lt_v + w_rt * rt_v + w_lb * lb_v + w_rb * rb_v; output_data[outIdx] = interpoV; return; @@ -186,7 +209,8 @@ __global__ void _GridSampleKernel( if (mode == 1) { // nearest int x_n = grid_x_imgSpace; int y_n = grid_y_imgSpace; - output_data[outIdx] = PixelAtGrid(input_data, BIdx, cIdx, y_n, x_n, padding_mode, N, C, H_in, W_in, border); + output_data[outIdx] = + PixelAtGrid(input_data, BIdx, cIdx, y_n, x_n, padding_mode, N, C, H_in, W_in, border); return; } if (mode == 2) { // bicubic @@ -195,7 +219,8 @@ __global__ void _GridSampleKernel( T p[4][4] = {}; // [H][W] for (int64_t h = 0; h < 4; h++) { for (int64_t w = 0; w < 4; w++) { - p[h][w] = PixelAtGrid(input_data, BIdx, cIdx, h + y0, w + x0, padding_mode, N, C, H_in, W_in, border); + p[h][w] = + PixelAtGrid(input_data, BIdx, cIdx, h + y0, w + x0, padding_mode, N, C, H_in, W_in, border); } } T dx = grid_x_imgSpace - x0 - 1; @@ -204,7 +229,7 @@ __global__ void _GridSampleKernel( } } -template +template void GridSampleImpl( cudaStream_t stream, const T* input_data, @@ -216,17 +241,23 @@ void GridSampleImpl( const int64_t H_out, const int64_t W_out, T* output_data) { - int blocksPerGrid = (int)(ceil(static_cast(dims[0] * dims[1] * H_out * W_out) / GridDim::maxThreadsPerBlock)); - _GridSampleKernel<<>>( - input_data, grid_data, mode, padding_mode, align_corners, dims[0], dims[1], dims[2], dims[3], H_out, W_out, output_data); + using Ch = Channels; + + int blocksPerGrid = static_cast( + ceil(static_cast(dims[Ch::N] * dims[Ch::C] * H_out * W_out) / GridDim::maxThreadsPerBlock)); + _GridSampleKernel<<>>( + input_data, grid_data, mode, padding_mode, align_corners, + dims[Ch::N], dims[Ch::C], dims[Ch::H], dims[Ch::W], + H_out, W_out, output_data); } -#define SPECIALIZED_IMPL(T) \ - template void GridSampleImpl(cudaStream_t stream, const T* input_data, const T* grid_data, \ - const int64_t mode, const int64_t padding_mode, const int64_t align_corners, \ - const int64_t[4], const int64_t H_out, const int64_t W_out, T* output_data); +#define SPECIALIZED_IMPL(T, IsNHWC) \ + template void GridSampleImpl(cudaStream_t stream, const T* input_data, const T* grid_data, \ + const int64_t mode, const int64_t padding_mode, const int64_t align_corners, \ + const int64_t[4], const int64_t H_out, const int64_t W_out, T* output_data); -SPECIALIZED_IMPL(float) +SPECIALIZED_IMPL(float, false) // NCHW +SPECIALIZED_IMPL(float, true) // NHWC } // namespace cuda } // namespace contrib diff --git a/onnxruntime/contrib_ops/cuda/grid_sample_impl.h b/onnxruntime/contrib_ops/cuda/grid_sample_impl.h index 6df86ce161908..62cd66a48fa84 100644 --- a/onnxruntime/contrib_ops/cuda/grid_sample_impl.h +++ b/onnxruntime/contrib_ops/cuda/grid_sample_impl.h @@ -8,7 +8,7 @@ namespace onnxruntime { namespace contrib { namespace cuda { -template +template void 
GridSampleImpl( cudaStream_t stream, const T* input_data, diff --git a/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc b/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc index 4505d4afdf1e0..a8717b99a8750 100644 --- a/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc +++ b/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc @@ -31,6 +31,7 @@ CostCheckResult PostLayoutTransformCostCheck(const api::GraphRef& graph, const a } #if defined(USE_CUDA) && ENABLE_CUDA_NHWC_OPS +// TODO(mtavenrath) generate list from registered kernels using nhwc domain const std::unordered_set& GetCUDALayoutSensitiveOps() { static std::unordered_set cuda_nhwc_ops = []() { return std::unordered_set{ @@ -41,6 +42,7 @@ const std::unordered_set& GetCUDALayoutSensitiveOps() { "MaxPool", "GlobalAveragePool", "AveragePool", + "GridSample", }; }(); return cuda_nhwc_ops; diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index be2530aec49fa..00783bcbc2665 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -1256,6 +1256,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, double, LessOrEqual); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, MLFloat16, LessOrEqual); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 17, ScatterElements); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, float, GridSample); // Opset 17 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, float, LayerNormalization); @@ -2148,6 +2149,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, // Opset 17 BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h index fa987866c002f..54c024793ff0b 100644 --- a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h +++ b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h @@ -168,5 +168,31 @@ struct NumericLimits { } }; +// TODO Where to put this? 
good places might be +// core/framework/tensor_shape.h +// core/util/matrix_layout.h + +constexpr bool LAYOUT_NCHW = false; +constexpr bool LAYOUT_NHWC = true; + +template +struct Channels; + +template <> +struct Channels { + static constexpr size_t N = 0; + static constexpr size_t H = 1; + static constexpr size_t W = 2; + static constexpr size_t C = 3; +}; + +template <> +struct Channels { + static constexpr size_t N = 0; + static constexpr size_t C = 1; + static constexpr size_t H = 2; + static constexpr size_t W = 3; +}; + } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc index 0f097622abff0..5c89d6ea7bd75 100644 --- a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc @@ -6,6 +6,33 @@ namespace onnxruntime { namespace test { + +std::vector> GetExecutionProviders(int opset_version) { + ORT_UNUSED_PARAMETER(opset_version); + + std::vector> execution_providers; + + execution_providers.emplace_back(DefaultCpuExecutionProvider()); +#ifdef USE_CUDA + if (opset_version < 20) { + execution_providers.emplace_back(DefaultCudaExecutionProvider()); +#ifdef ENABLE_CUDA_NHWC_OPS + execution_providers.push_back(DefaultCudaNHWCExecutionProvider()); +#endif + } + +#endif + return execution_providers; +} + +template +void RunTests(T& test, std::vector>&& execution_providers) { + for (size_t idx = 0; idx < execution_providers.size(); ++idx) { + test.ConfigEp(std::move(execution_providers[idx])).RunWithConfig(); + } + execution_providers.clear(); +} + // DO NOT edit following tests. They are generated by: // onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py TEST(GridsampleTest, test_grid_sample_16_4D_nearest_zeros_align_corners) { @@ -25,8 +52,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_zeros_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_nearest_zeros_no_align_corners) { @@ -46,8 +72,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_zeros_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_nearest_border_align_corners) { @@ -67,8 +92,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_border_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_nearest_border_no_align_corners) { @@ -88,8 +112,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_border_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_nearest_reflection_align_corners) 
{ @@ -109,8 +132,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_reflection_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_nearest_reflection_no_align_corners) { @@ -130,8 +152,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_reflection_no_align_corners) test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_zeros_align_corners) { @@ -151,8 +172,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_zeros_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_zeros_no_align_corners) { @@ -172,8 +192,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_zeros_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_border_align_corners) { @@ -193,8 +212,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_border_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_border_no_align_corners) { @@ -214,8 +232,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_border_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_reflection_align_corners) { @@ -235,8 +252,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_reflection_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_reflection_no_align_corners) { @@ -256,8 +272,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_reflection_no_align_corners test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_zeros_align_corners) { @@ -277,8 +292,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_zeros_align_corners) { 
test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_zeros_no_align_corners) { @@ -298,8 +312,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_zeros_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_border_align_corners) { @@ -319,8 +332,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_border_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_border_no_align_corners) { @@ -340,8 +352,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_border_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_reflection_align_corners) { @@ -361,8 +372,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_reflection_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_reflection_no_align_corners) { @@ -382,8 +392,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_reflection_no_align_corners) test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_20_4D_nearest_zeros_align_corners) { @@ -403,8 +412,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_zeros_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_nearest_zeros_align_corners) { @@ -424,8 +432,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_zeros_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_nearest_zeros_no_align_corners) { @@ -445,8 +452,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_zeros_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, 
Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_nearest_zeros_no_align_corners) { @@ -466,8 +472,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_zeros_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_nearest_border_align_corners) { @@ -487,8 +492,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_border_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_nearest_border_align_corners) { @@ -508,8 +512,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_border_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_nearest_border_no_align_corners) { @@ -529,8 +532,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_border_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_nearest_border_no_align_corners) { @@ -550,8 +552,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_border_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_nearest_reflection_align_corners) { @@ -571,8 +572,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_reflection_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_nearest_reflection_align_corners) { @@ -592,8 +592,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_reflection_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_nearest_reflection_no_align_corners) { @@ -613,8 +612,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_reflection_no_align_corners) test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, 
GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_nearest_reflection_no_align_corners) { @@ -634,8 +632,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_reflection_no_align_corners) test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_zeros_align_corners) { @@ -655,8 +652,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_zeros_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_zeros_align_corners) { @@ -676,8 +672,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_zeros_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_zeros_no_align_corners) { @@ -697,8 +692,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_zeros_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_zeros_no_align_corners) { @@ -718,8 +712,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_zeros_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_border_align_corners) { @@ -739,8 +732,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_border_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_border_align_corners) { @@ -760,8 +752,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_border_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_border_no_align_corners) { @@ -781,8 +772,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_border_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_border_no_align_corners) { @@ -802,8 +792,7 
@@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_border_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_reflection_align_corners) { @@ -823,8 +812,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_reflection_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_reflection_align_corners) { @@ -844,8 +832,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_reflection_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_reflection_no_align_corners) { @@ -865,8 +852,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_reflection_no_align_corners test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_reflection_no_align_corners) { @@ -886,8 +872,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_reflection_no_align_corners test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_zeros_align_corners) { @@ -907,8 +892,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_zeros_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_zeros_no_align_corners) { @@ -928,8 +912,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_zeros_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_border_align_corners) { @@ -949,8 +932,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_border_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_border_no_align_corners) { @@ -970,8 +952,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_border_no_align_corners) { 
test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_reflection_align_corners) { @@ -991,8 +972,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_reflection_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_reflection_no_align_corners) { @@ -1012,8 +992,8 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_reflection_no_align_corners) test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py b/onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py index e4d58e79243ef..c60e55617774f 100644 --- a/onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py +++ b/onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py @@ -76,6 +76,6 @@ print('test.AddAttribute("padding_mode", padding_mode);') print('test.AddAttribute("align_corners", align_corners);') print('test.AddOutput("Y", Y_shape, Y_data);') - print("test.Run();") + print(f"RunTests(test, GetExecutionProviders({opset_version}));") print("}") print("\n") diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index 40b40136af1af..b404c12db3582 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -8,6 +8,9 @@ #ifdef USE_COREML #include "core/providers/coreml/coreml_provider_factory.h" #endif +#if defined(ENABLE_CUDA_NHWC_OPS) +#include +#endif #include "core/session/onnxruntime_cxx_api.h" #include "core/framework/session_options.h" @@ -118,6 +121,19 @@ std::unique_ptr DefaultCudaExecutionProvider() { return nullptr; } +#ifdef ENABLE_CUDA_NHWC_OPS +std::unique_ptr DefaultCudaNHWCExecutionProvider() { +#if defined(USE_CUDA) + OrtCUDAProviderOptionsV2 provider_options{}; + provider_options.do_copy_in_default_stream = true; + provider_options.prefer_nhwc = true; + if (auto factory = CudaProviderFactoryCreator::Create(&provider_options)) + return factory->CreateProvider(); +#endif + return nullptr; +} +#endif + std::unique_ptr CudaExecutionProviderWithOptions(const OrtCUDAProviderOptionsV2* provider_options) { #ifdef USE_CUDA if (auto factory = CudaProviderFactoryCreator::Create(provider_options)) diff --git a/onnxruntime/test/util/include/default_providers.h b/onnxruntime/test/util/include/default_providers.h index 9f78e0a0d4eb2..738fc66d775c6 100644 --- a/onnxruntime/test/util/include/default_providers.h +++ b/onnxruntime/test/util/include/default_providers.h @@ -35,6 +35,9 @@ namespace test { // unique_ptr providers with default values for session registration std::unique_ptr DefaultCpuExecutionProvider(bool enable_arena = true); std::unique_ptr DefaultCudaExecutionProvider(); +#ifdef ENABLE_CUDA_NHWC_OPS +std::unique_ptr DefaultCudaNHWCExecutionProvider(); +#endif std::unique_ptr 
CudaExecutionProviderWithOptions(const OrtCUDAProviderOptionsV2* provider_options); std::unique_ptr DefaultDnnlExecutionProvider(); std::unique_ptr DnnlExecutionProviderWithOptions(const OrtDnnlProviderOptions* provider_options); From ae3d73c9818c34af42c785ff2bd9558007ba315f Mon Sep 17 00:00:00 2001 From: satyajandhyala Date: Fri, 23 Feb 2024 00:21:15 -0800 Subject: [PATCH 049/279] [JS/WebGPU] Fix Split and Where to handle corner cases. (#19613) ### Description 1. Fix Where operator to handle Boolean input less than 4 bytes. 2. Fix JSEP test harness to use tensor names consistently. ### Motivation and Context --- js/web/lib/wasm/jsep/webgpu/ops/where.ts | 3 ++- js/web/test/data/ops/where.jsonc | 34 ++++++++++++++++++++++++ js/web/test/test-runner.ts | 4 +-- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/where.ts b/js/web/lib/wasm/jsep/webgpu/ops/where.ts index cfee07a9239d7..a6375847fc42f 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/where.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/where.ts @@ -27,7 +27,7 @@ const createWhereOpProgramShader = const expressionA = `a_data[index_a${x}][component_a${x}]`; const expressionB = `b_data[index_b${x}][component_b${x}]`; // eslint-disable-next-line no-bitwise - const expressionC = `bool(c_data[index_c${x}] & ${0xff000000 >>> ((3 - x) * 8)}u)`; + const expressionC = `bool(c_data[index_c${x}] & (0xffu << (component_c${x} * 8)))`; return ` let output_indices${x} = ${output.offsetToIndices(`global_idx * 4u + ${x}u`)}; let offset_a${x} = ${a.broadcastedIndicesToOffset(`output_indices${x}`, output)}; @@ -38,6 +38,7 @@ const createWhereOpProgramShader = let index_c${x} = offset_c${x} / 4u; let component_a${x} = offset_a${x} % 4u; let component_b${x} = offset_b${x} % 4u; + let component_c${x} = offset_c${x} % 4u; ${resStr}[${x}] = ${typeCast}(${expression(expressionA, expressionB, expressionC)}); `; }; diff --git a/js/web/test/data/ops/where.jsonc b/js/web/test/data/ops/where.jsonc index 047fd6fd7511b..990120dd3708e 100644 --- a/js/web/test/data/ops/where.jsonc +++ b/js/web/test/data/ops/where.jsonc @@ -168,5 +168,39 @@ ] } ] + }, + { + "name": "Where with no attributes", + "operator": "Where", + "attributes": [], + "cases": [ + { + "name": "T[1 1 2 1] T[1 4] T[1 1 2 4] float32 broadcast 1", + "inputs": [ + { + "data": [true, false], + "dims": [1, 1, 2, 1], + "type": "bool" + }, + { + "data": [1, 2, 3, 4], + "dims": [1, 4], + "type": "float32" + }, + { + "data": [5, 6, 7, 8, 9, 10, 11, 12], + "dims": [1, 1, 2, 4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [1, 2, 3, 4, 9, 10, 11, 12], + "dims": [1, 1, 2, 4], + "type": "float32" + } + ] + } + ] } ] diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index ecc7d4b4a09a5..a4adf5c4ce144 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -627,8 +627,8 @@ export async function runModelTestSet( try { const feeds: Record = {}; const outputsMetaInfo: Record = {}; - testCase.inputs!.forEach((tensor, i) => feeds[context.session.inputNames[i]] = tensor); - testCase.outputs!.forEach((tensor, i) => outputsMetaInfo[context.session.outputNames[i]] = tensor); + testCase.inputs!.forEach((tensor) => feeds[tensor.name] = tensor); + testCase.outputs!.forEach((tensor) => outputsMetaInfo[tensor.name] = tensor); const [start, end, outputs] = await sessionRun({session: context.session, feeds, outputsMetaInfo, ioBinding: context.ioBinding}); if (context.perfData.count === 0) { From f4306004321efe9a0e65a19a707bf2266ffd7b16 Mon Sep 17 
00:00:00 2001
From: cao lei
Date: Fri, 23 Feb 2024 06:02:05 -0800
Subject: [PATCH 050/279] Enable streams for DML EP. This change is to revert PR 19481 since the bug 19480 is fixed by PR 19515 (#19609)

### Description
Enable streams for DML EP. This change is to revert PR 19481 since the bug 19480 is fixed by PR 19515

### Motivation and Context
Enable streams for DML EP. This change is to revert PR 19481 since the bug 19480 is fixed by PR 19515

---
 cmake/adjust_global_compile_flags.cmake | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake
index a56864ebf4644..8161ea574b8cc 100644
--- a/cmake/adjust_global_compile_flags.cmake
+++ b/cmake/adjust_global_compile_flags.cmake
@@ -92,13 +92,8 @@ if (onnxruntime_MINIMAL_BUILD)
   endif()
 endif()
 
-# Enable stream for all the non-minimal build, except for DML. There's currently a bug
-# in the allocation planner when reusing buffers and more than one streams are used that
-# make it possible (although rarely) to reach a reference count of 0 for a buffer that is
-# still being used. Since DML doesn't benefit from multiple streams, disabling it is the
-# safest option for now.
-# https://github.com/microsoft/onnxruntime/issues/19480
-if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_USE_DML)
+# Enable stream for all the non-minimal build
+if (NOT onnxruntime_MINIMAL_BUILD)
   add_compile_definitions(ORT_ENABLE_STREAM)
 endif()

From efbe2b84556c195e7d7f3353321eb3f410a1e645 Mon Sep 17 00:00:00 2001
From: Markus Tavenrath
Date: Fri, 23 Feb 2024 17:45:17 +0100
Subject: [PATCH 051/279] Fix cuDNN v9 build by replacing removed cuDNN v6 RNN API usage by cuDNN v8 RNN API and reenable RNN tests for CUDA EP (#19419)

Replace deprecated cuDNN RNN based API by cuDNN v8 RNN API and re-enable RNN tests for the CUDA EP.

### Motivation and Context
The deprecated cuDNN RNN API might vanish soon, and in addition all RNN tests are currently disabled for the CUDA EP RNN implementation due to failures. With this change the deprecated API has been removed and the new implementation no longer fails the tests.
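At its core the port swaps the removed per-matrix and per-bias queries (`cudnnGetRNNLinLayerMatrixParams` / `cudnnGetRNNLinLayerBiasParams`) for the single cuDNN v8 weight-space query `cudnnGetRNNWeightParams`, as `SetWeightBias` in the diff below does. A minimal sketch of that query pattern, simplified from the real code (the `QueryWeightAndBias` helper is invented for the example; the handle, RNN descriptor and tensor descriptors are assumed to be created by the caller):

```
#include <array>
#include <cudnn.h>

// Sketch: locate one weight matrix and its bias inside the packed cuDNN v8
// weight space, replacing the removed cudnnGetRNNLinLayer*Params calls.
cudnnStatus_t QueryWeightAndBias(cudnnHandle_t handle, cudnnRNNDescriptor_t rnn_desc,
                                 int pseudo_layer, int lin_layer_id,
                                 size_t weight_space_bytes, const void* weight_space,
                                 cudnnTensorDescriptor_t mat_desc, void** mat_addr,
                                 cudnnTensorDescriptor_t bias_desc, void** bias_addr,
                                 int* mat_elem_count) {
  // One call returns both the matrix and the bias for this (layer, id) pair.
  cudnnStatus_t status = cudnnGetRNNWeightParams(handle, rnn_desc, pseudo_layer,
                                                 weight_space_bytes, weight_space,
                                                 lin_layer_id,
                                                 mat_desc, mat_addr,
                                                 bias_desc, bias_addr);
  if (status != CUDNN_STATUS_SUCCESS) return status;

  // The element count now comes from the returned tensor descriptor instead of
  // the old cudnnGetFilterNdDescriptor call.
  cudnnDataType_t dtype;
  int num_dims = 0;
  std::array<int, 3> dims{}, strides{};
  status = cudnnGetTensorNdDescriptor(mat_desc, 3, &dtype, &num_dims, dims.data(), strides.data());
  if (status == CUDNN_STATUS_SUCCESS) *mat_elem_count = dims[0] * dims[1] * dims[2];
  return status;
}
```

The returned tensor descriptors take over the bookkeeping previously done through filter descriptors, which is also why `CudnnTensor::CreateTensorIfNeeded` is made public in this change.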
--- .../core/providers/cuda/cudnn_common.h | 4 +- .../core/providers/cuda/rnn/cudnn_rnn_base.cc | 350 +++++++++--------- .../core/providers/cuda/rnn/cudnn_rnn_base.h | 55 +-- onnxruntime/core/providers/cuda/rnn/rnn.cc | 3 +- onnxruntime/core/providers/cuda/rnn/rnn.h | 1 + .../core/providers/cuda/rnn/rnn_impl.cu | 91 +---- .../core/providers/cuda/rnn/rnn_impl.h | 14 +- .../test/providers/cpu/rnn/rnn_op_test.cc | 24 +- 8 files changed, 240 insertions(+), 302 deletions(-) diff --git a/onnxruntime/core/providers/cuda/cudnn_common.h b/onnxruntime/core/providers/cuda/cudnn_common.h index fdd14dedad47e..2cbeb13696270 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.h +++ b/onnxruntime/core/providers/cuda/cudnn_common.h @@ -24,12 +24,12 @@ class CudnnTensor final { operator cudnnTensorDescriptor_t() const { return tensor_; } + Status CreateTensorIfNeeded(); + template static cudnnDataType_t GetDataType(); private: - Status CreateTensorIfNeeded(); - cudnnTensorDescriptor_t tensor_; }; diff --git a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc index 99c1f48e21c74..b61b104790fe5 100644 --- a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc +++ b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc @@ -9,40 +9,49 @@ namespace onnxruntime { namespace cuda { template -void CudnnRnnBase::SetWeightBias(const cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnn_desc, - const int pseudo_layer, - const cudnnTensorDescriptor_t x_desc, - const cudnnFilterDescriptor_t w_desc, - const cudnnFilterDescriptor_t filter_desc, - const void* reorganized_w_data, - const int lin_layer_id, - const T* pos, - int& offset, - bool is_matrix, - cudaStream_t cuda_stream) const { +Status CudnnRnnBase::SetWeightBias(const cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnn_desc, + const int pseudo_layer, + size_t reorganized_w_data_size, + const void* reorganized_w_data, + const int lin_layer_id, + const T* pos, + int& offset, + bool is_matrix, + cudaStream_t cuda_stream) const { int numDims; - std::vector matDims(3); + std::array matDims; + std::array strideA; cudnnDataType_t dt; - cudnnTensorFormat_t tf; T* mem_offset; - if (is_matrix) { - cudnnGetRNNLinLayerMatrixParams(handle, rnn_desc, pseudo_layer, x_desc, w_desc, reorganized_w_data, lin_layer_id, filter_desc, (void**)&mem_offset); - } else { - cudnnGetRNNLinLayerBiasParams(handle, rnn_desc, pseudo_layer, x_desc, w_desc, reorganized_w_data, lin_layer_id, filter_desc, (void**)&mem_offset); - } + CudnnTensor tensor_desc_matrix, tensor_desc_bias; + ORT_RETURN_IF_ERROR(tensor_desc_bias.CreateTensorIfNeeded()); + ORT_RETURN_IF_ERROR(tensor_desc_matrix.CreateTensorIfNeeded()); - cudnnGetFilterNdDescriptor(filter_desc, 3, &dt, &tf, &numDims, matDims.data()); + T *mem_offset_matrix, *mem_offset_bias; + CUDNN_RETURN_IF_ERROR(cudnnGetRNNWeightParams( + handle, rnn_desc, pseudo_layer, reorganized_w_data_size, reorganized_w_data, + lin_layer_id, tensor_desc_matrix, (void**)&mem_offset_matrix, tensor_desc_bias, (void**)&mem_offset_bias)); + CUDNN_RETURN_IF_ERROR(cudnnGetTensorNdDescriptor( + is_matrix ? tensor_desc_matrix : tensor_desc_bias, 3, &dt, &numDims, matDims.data(), strideA.data())); + + mem_offset = is_matrix ? 
mem_offset_matrix : mem_offset_bias; int count = matDims[0] * matDims[1] * matDims[2]; + + if (strideA[0] != count) { + return ORT_MAKE_STATUS(ONNXRUNTIME, StatusCode::INVALID_ARGUMENT, "Stride is not packed"); + } CUDA_CALL_THROW(cudaMemcpyAsync(mem_offset, pos + offset, count * sizeof(T), cudaMemcpyDeviceToDevice, cuda_stream)); + offset += count; + + return Status::OK(); } template Status CudnnRnnBase::SetCudnnRnnWeightBias(const cudnnHandle_t cudnn_handle, const cudnnRNNDescriptor_t rnn_desc, - const cudnnTensorDescriptor_t x_desc, - const cudnnFilterDescriptor_t w_desc, + size_t reorganized_w_data_size, void* reorganized_w_data, const T* W_data, const T* R_data, @@ -51,18 +60,22 @@ Status CudnnRnnBase::SetCudnnRnnWeightBias(const cudnnHandle_t cudnn_handle, int w_offset = 0; int r_offset = 0; int bias_offset = 0; - CudnnFilterDescriptor filter_desc; for (int layer = 0; layer < RNN_NUM_LAYERS * num_directions_; ++layer) { for (size_t idx = 0; idx < W_lin_layer_id_.size(); ++idx) { - SetWeightBias(cudnn_handle, rnn_desc, layer, x_desc, w_desc, filter_desc, reorganized_w_data, W_lin_layer_id_[idx], W_data, w_offset, true, cuda_stream); + ORT_RETURN_IF_ERROR(SetWeightBias( + cudnn_handle, rnn_desc, layer, reorganized_w_data_size, reorganized_w_data, + W_lin_layer_id_[idx], W_data, w_offset, true, cuda_stream)); if (B_data != nullptr) { - SetWeightBias(cudnn_handle, rnn_desc, layer, x_desc, w_desc, filter_desc, reorganized_w_data, W_lin_layer_id_[idx], B_data, bias_offset, false, cuda_stream); + ORT_RETURN_IF_ERROR(SetWeightBias(cudnn_handle, rnn_desc, layer, reorganized_w_data_size, reorganized_w_data, + W_lin_layer_id_[idx], B_data, bias_offset, false, cuda_stream)); } } for (size_t idx = 0; idx < R_lin_layer_id_.size(); ++idx) { - SetWeightBias(cudnn_handle, rnn_desc, layer, x_desc, w_desc, filter_desc, reorganized_w_data, R_lin_layer_id_[idx], R_data, r_offset, true, cuda_stream); + ORT_RETURN_IF_ERROR(SetWeightBias(cudnn_handle, rnn_desc, layer, reorganized_w_data_size, reorganized_w_data, + R_lin_layer_id_[idx], R_data, r_offset, true, cuda_stream)); if (B_data != nullptr) { - SetWeightBias(cudnn_handle, rnn_desc, layer, x_desc, w_desc, filter_desc, reorganized_w_data, R_lin_layer_id_[idx], B_data, bias_offset, false, cuda_stream); + ORT_RETURN_IF_ERROR(SetWeightBias(cudnn_handle, rnn_desc, layer, reorganized_w_data_size, reorganized_w_data, + R_lin_layer_id_[idx], B_data, bias_offset, false, cuda_stream)); } } } @@ -72,6 +85,7 @@ Status CudnnRnnBase::SetCudnnRnnWeightBias(const cudnnHandle_t cudnn_handle, template Status CudnnRnnBase::ReorganizeWeights(const Tensor* W, const Tensor* R, const Tensor* B, + size_t& reorganized_w_data_size_in_bytes, IAllocatorUniquePtr& reorganized_w_data, CudnnFilterDescriptor& target_w_desc, CudnnRNN& rnn_desc, onnxruntime::Stream* ort_stream) const { @@ -91,19 +105,16 @@ Status CudnnRnnBase::ReorganizeWeights(const Tensor* W, const Tensor* R, cons TensorShapeVector dims_w({w_size, 1, 1}); ORT_RETURN_IF_ERROR(target_w_desc.Set(dims_w, CudnnTensor::GetDataType())); - TensorShapeVector fake_dims_x({1, input_size, 1}); - CudnnTensor fake_x_desc; - ORT_RETURN_IF_ERROR(fake_x_desc.Set(fake_dims_x, CudnnTensor::GetDataType())); - // Prepare the weight data - reorganized_w_data = GetScratchBuffer(w_size * sizeof(T), ort_stream); + reorganized_w_data_size_in_bytes = w_size * sizeof(T); + reorganized_w_data = GetScratchBuffer(reorganized_w_data_size_in_bytes, ort_stream); // In many cases, this allocation is bigger than needed, leaving part of - // the buffer 
unintialized. non-zero garbage data leads to wrong result + // the buffer uninitialized. non-zero garbage data leads to wrong result // in call to cudnnRNNForwardInference() // TODO! refine allocation size for each case. cudaStream_t cuda_stream = ort_stream ? static_cast(ort_stream->GetHandle()) : nullptr; - cudaMemsetAsync(reorganized_w_data.get(), 0, w_size * sizeof(T), cuda_stream); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(reorganized_w_data.get(), 0, reorganized_w_data_size_in_bytes, cuda_stream)); const T* W_data = W->Data(); const T* R_data = R->Data(); @@ -111,8 +122,9 @@ Status CudnnRnnBase::ReorganizeWeights(const Tensor* W, const Tensor* R, cons auto* ort_cuda_stream = dynamic_cast(ort_stream); cudnnHandle_t cudnn_handle = ort_cuda_stream ? ort_cuda_stream->cudnn_handle_ : DefaultCudnnHandle(); - ORT_RETURN_IF_ERROR(SetCudnnRnnWeightBias(cudnn_handle, rnn_desc, fake_x_desc, target_w_desc, - reorganized_w_data.get(), W_data, R_data, B_data, cuda_stream)); + ORT_RETURN_IF_ERROR(SetCudnnRnnWeightBias(cudnn_handle, rnn_desc, + reorganized_w_data_size_in_bytes, reorganized_w_data.get(), + W_data, R_data, B_data, cuda_stream)); return Status::OK(); } @@ -128,22 +140,31 @@ Status CudnnRnnBase::CacheCudnnRnnWeights(const OpKernelInfo& info) { bool get_R = info.TryGetConstantInput(RNN_Input_Index::R, &R); bool get_B = info.TryGetConstantInput(RNN_Input_Index::B, &B); + bool has_bias = B != nullptr; + if (get_W && get_R) { CudnnRNN tmp_rnn_desc; - ORT_RETURN_IF_ERROR(tmp_rnn_desc.Set(DefaultCudnnHandle(), + auto proj_size = hidden_size_; + ORT_RETURN_IF_ERROR(tmp_rnn_desc.Set(W->Shape()[2], // input_size hidden_size_, + proj_size, RNN_NUM_LAYERS, cudnn_dropout_desc_, cudnn_direction_mode_, rnn_mode_, - CudnnTensor::GetDataType(), - GetDeviceProp())); + has_bias, + CudnnTensor::GetDataType())); if (get_B) { - ORT_RETURN_IF_ERROR(ReorganizeWeights(W, R, B, w_data_cache_, w_desc_cache_, tmp_rnn_desc, nullptr)); + ORT_RETURN_IF_ERROR(ReorganizeWeights(W, R, B, + w_data_cache_size_in_bytes_, w_data_cache_, w_desc_cache_, + tmp_rnn_desc, nullptr)); } else { - ORT_RETURN_IF_ERROR(ReorganizeWeights(W, R, nullptr, w_data_cache_, w_desc_cache_, tmp_rnn_desc, nullptr)); + ORT_RETURN_IF_ERROR(ReorganizeWeights(W, R, nullptr, + w_data_cache_size_in_bytes_, w_data_cache_, w_desc_cache_, + tmp_rnn_desc, nullptr)); } cudaStreamSynchronize(nullptr); + weight_cached_ = true; } @@ -158,17 +179,72 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { ORT_ENFORCE(nullptr != X); // optional inputs - const Tensor* sequence_lens = ctx->Input(RNN_Input_Index::sequence_lens); // [batch_size] - const Tensor* initial_h = ctx->Input(RNN_Input_Index::initial_h); // initial hidden. [num_directions_, batch_size, hidden_size_] + // [batch_size] + const Tensor* sequence_lens = ctx->Input(RNN_Input_Index::sequence_lens); + // initial hidden. [num_directions_, batch_size, hidden_size_] + const Tensor* initial_h = ctx->Input(RNN_Input_Index::initial_h); const Tensor* initial_c(nullptr); if (rnn_mode_ == CUDNN_LSTM) { - initial_c = ctx->Input(RNN_Input_Index::initial_c); // initial cell. [num_directions_, batch_size, hidden_size_] + // initial cell. [num_directions_, batch_size, hidden_size_] + initial_c = ctx->Input(RNN_Input_Index::initial_c); } + size_t proj_size = hidden_size_; int64_t seq_length = X->Shape()[0]; int64_t batch_size = X->Shape()[1]; int64_t input_size = X->Shape()[2]; + // we thread a single input as sequence_lens of length 1, require to expand to [batch_size]? 
+ std::vector sequence_lengths_temp; + if (!sequence_lens) { + sequence_lengths_temp.resize(batch_size, gsl::narrow_cast(seq_length)); + } + + const int32_t* sequence_lens_data = (sequence_lens == nullptr) + ? sequence_lengths_temp.data() + : sequence_lens->Data(); + + // cuDNN doesn't support 0 sequence inside the batch, find the 0 sequence and set it to 1 + // there's a ZeroMask kernel to reset the result to 0 for the 0 sequence + int64_t zero_seq_count = 0; + std::vector zero_seq_index_cache(batch_size, 0); + + CudaAsyncBuffer sequence_lens_buffer(this, batch_size); + int32_t* seq_len_array = sequence_lens_buffer.CpuPtr(); + + // 0-len sequences are not supported by cuDNN. + // Replace them by sequences of len 1 and mask them out with SetZeroSequences + for (int i = 0; i < batch_size; ++i) { + if (0 == sequence_lens_data[i]) { + seq_len_array[i] = 1; + zero_seq_index_cache[zero_seq_count] = i; + ++zero_seq_count; + } else { + seq_len_array[i] = sequence_lens_data[i]; + } + } + + // Calculate the zero position cache for reverse direction if it's bidirectional + // The cache is for Y_h or Y_c, and the 1st sequence for Y, no need to do it for other sequence in Y since + // we hacked the 0 sequence to 1 + if (zero_seq_count && num_directions_ > 1) { + zero_seq_index_cache.resize(zero_seq_count * num_directions_); + for (int64_t i = 0; i < zero_seq_count; ++i) { + zero_seq_index_cache[static_cast(zero_seq_count) + i] = + static_cast(batch_size + zero_seq_index_cache[i]); + } + zero_seq_count *= num_directions_; + } + + // Prior to cuDNN 8.9.1 the sequence lens buffer must be passed to cudnnRNNForward and thus is must + // be copied to the GPU always. + ORT_RETURN_IF_ERROR(sequence_lens_buffer.CopyToGpu(ctx->GetComputeStream())); + // Starting with cuDNN 8.9.1 the sequence lens buffer is ignored by cudnnRNNForward and thus it must + // be copied to the GPU only for the ReverseBySequence kernels. 
+ // if (reverse_) { + // ORT_RETURN_IF_ERROR(sequence_lens_buffer.CopyToGpu(ctx->GetComputeStream())); + // } + // optional outputs TensorShapeVector dims_Y({seq_length, num_directions_, batch_size, hidden_size_}); TensorShapeVector dims_hxy({RNN_NUM_LAYERS * num_directions_, batch_size, hidden_size_}); @@ -177,25 +253,6 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { Tensor* Y_h = ctx->Output(Output_Index::Y_h, dims_hxy); Tensor* Y_c = ctx->Output(Output_Index::Y_c, dims_yc); - std::vector dims_x({batch_size, input_size, 1}); - std::vector dims_y({batch_size, hidden_size_ * num_directions_, 1}); - - CudnnTensor x_desc_temp; - ORT_RETURN_IF_ERROR(x_desc_temp.Set(dims_x, CudnnTensor::GetDataType())); - CudnnTensor y_desc_temp; - ORT_RETURN_IF_ERROR(y_desc_temp.Set(dims_y, CudnnTensor::GetDataType())); - std::vector x_desc(seq_length, x_desc_temp); - std::vector y_desc(seq_length, y_desc_temp); - - CudnnTensor hx_desc; - CudnnTensor cx_desc; - CudnnTensor y_h_desc; - CudnnTensor y_c_desc; - ORT_RETURN_IF_ERROR(hx_desc.Set(dims_hxy, CudnnTensor::GetDataType())); - ORT_RETURN_IF_ERROR(cx_desc.Set(dims_hxy, CudnnTensor::GetDataType())); - ORT_RETURN_IF_ERROR(y_h_desc.Set(dims_hxy, CudnnTensor::GetDataType())); - ORT_RETURN_IF_ERROR(y_c_desc.Set(dims_hxy, CudnnTensor::GetDataType())); - IAllocatorUniquePtr x_reversed_data; const T* x_data = X->Data(); if (reverse_) { @@ -203,6 +260,7 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { x_reversed_data = GetScratchBuffer(seq_length * batch_size * input_size, ctx->GetComputeStream()); ReverseBySequence(Stream(ctx), gsl::narrow_cast(seq_length), + sequence_lens_buffer.GpuPtr(), gsl::narrow_cast(batch_size), gsl::narrow_cast(input_size), reinterpret_cast(x_data), @@ -226,115 +284,82 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { y_data = y_alloc_data.get(); } - const int32_t* sequence_lens_data = (sequence_lens == nullptr) ? nullptr : sequence_lens->Data(); + const Tensor* B = ctx->Input(RNN_Input_Index::B); + bool has_bias = B != nullptr; CudnnRNN rnn_desc; - ORT_RETURN_IF_ERROR(rnn_desc.Set(GetCudnnHandle(ctx), + ORT_RETURN_IF_ERROR(rnn_desc.Set(input_size, hidden_size_, + proj_size, RNN_NUM_LAYERS, cudnn_dropout_desc_, cudnn_direction_mode_, rnn_mode_, - CudnnTensor::GetDataType(), - GetDeviceProp())); + has_bias, + CudnnTensor::GetDataType())); // Prepare the weight data + size_t w_data_size_in_bytes = 0; IAllocatorUniquePtr w_data; CudnnFilterDescriptor w_desc; if (!weight_cached_) { const Tensor& W = *ctx->Input(RNN_Input_Index::W); const Tensor& R = *ctx->Input(RNN_Input_Index::R); const Tensor* B = ctx->Input(RNN_Input_Index::B); - ORT_RETURN_IF_ERROR(ReorganizeWeights(&W, &R, B, w_data, w_desc, rnn_desc, ctx->GetComputeStream())); + ORT_RETURN_IF_ERROR(ReorganizeWeights(&W, &R, B, w_data_size_in_bytes, w_data, w_desc, + rnn_desc, ctx->GetComputeStream())); } - // CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED works with CUDNN_RNN_PADDED_IO_ENABLED, so that it will auto fill 0 for the shorter sequences - CUDNN_RETURN_IF_ERROR(cudnnSetRNNPaddingMode(rnn_desc, CUDNN_RNN_PADDED_IO_ENABLED)); + CudnnDataTensor x_desc1; + ORT_RETURN_IF_ERROR(x_desc1.Set(CudnnTensor::GetDataType(), seq_length, batch_size, + input_size, seq_len_array)); + CudnnDataTensor y_desc1; + ORT_RETURN_IF_ERROR(y_desc1.Set(CudnnTensor::GetDataType(), seq_length, batch_size, + ((rnn_mode_ == CUDNN_LSTM) ? 
proj_size : hidden_size_) * num_directions_, + seq_len_array)); - size_t workspace_bytes; - CUDNN_RETURN_IF_ERROR(cudnnGetRNNWorkspaceSize(GetCudnnHandle(ctx), rnn_desc, gsl::narrow_cast(seq_length), x_desc.data(), &workspace_bytes)); - auto workspace_cuda = GetScratchBuffer(workspace_bytes, ctx->GetComputeStream()); - int64_t zero_seq_count = 0; - std::vector zero_seq_index_cache(batch_size, 0); - int64_t zero_seq_index_cache_size = 0; - - if (CUDNN_RNN_RELU == rnn_mode_ || CUDNN_RNN_TANH == rnn_mode_ || nullptr == sequence_lens_data) { - CUDNN_RETURN_IF_ERROR(cudnnRNNForwardInference(GetCudnnHandle(ctx), - rnn_desc, - gsl::narrow_cast(seq_length), - x_desc.data(), - x_data_input, - hx_desc, - hx_data, - cx_desc, - cx_data, - weight_cached_ ? w_desc_cache_ : w_desc, - weight_cached_ ? w_data_cache_.get() : w_data.get(), - y_desc.data(), - y_data, - y_h_desc, - y_h_data, - y_c_desc, - y_c_data, - workspace_cuda.get(), - workspace_bytes)); - } else { - // cudnn doesn't support 0 sequence inside the batch, find the 0 sequence and set it to 1 - // there's a ZeroMask kernel to reset the result to 0 for the 0 sequence - std::vector seq_len_array(sequence_lens_data, sequence_lens_data + batch_size); - for (int i = 0; i < batch_size; ++i) { - if (0 == seq_len_array[i]) { - seq_len_array[i] = 1; - zero_seq_index_cache[zero_seq_count] = i; - ++zero_seq_count; - } - } + CudnnTensor cx_desc; + ORT_RETURN_IF_ERROR(cx_desc.Set(dims_hxy, CudnnTensor::GetDataType())); - // Calculate the zero position cache for reverse direction if it's bidirectional - // The cache is for Y_h or Y_c, and the 1st sequence for Y, no need to do it for other sequence in Y since - // we hacked the 0 sequence to 1 - if (zero_seq_count && num_directions_ > 1) { - zero_seq_index_cache_size = zero_seq_count * num_directions_; - zero_seq_index_cache.resize(zero_seq_index_cache_size); - for (int64_t i = 0; i < zero_seq_count; ++i) { - zero_seq_index_cache[static_cast(zero_seq_count) + i] = static_cast(batch_size + zero_seq_index_cache[i]); - } - } + CudnnTensor hx_desc; + ORT_RETURN_IF_ERROR(hx_desc.Set(dims_hxy, CudnnTensor::GetDataType())); + + // reserveSpaceSize is not required cudnnRNNForward, but returned by cudnnGetRNNTempSpaceSizes + size_t workspace_bytes, reservespace_bytes; - CudnnDataTensor x_desc1; - ORT_RETURN_IF_ERROR(x_desc1.Set(CudnnTensor::GetDataType(), seq_length, batch_size, input_size, seq_len_array.data())); - CudnnDataTensor y_desc1; - ORT_RETURN_IF_ERROR(y_desc1.Set(CudnnTensor::GetDataType(), seq_length, batch_size, hidden_size_ * num_directions_, seq_len_array.data())); - - CUDNN_RETURN_IF_ERROR(cudnnRNNForwardInferenceEx(GetCudnnHandle(ctx), - rnn_desc, - x_desc1, - x_data_input, - hx_desc, - hx_data, - cx_desc, - cx_data, - weight_cached_ ? w_desc_cache_ : w_desc, - weight_cached_ ? w_data_cache_.get() : w_data.get(), - y_desc1, - y_data, - y_h_desc, - y_h_data, - y_c_desc, - y_c_data, - nullptr, nullptr, nullptr, nullptr, - nullptr, nullptr, nullptr, nullptr, - workspace_cuda.get(), - workspace_bytes)); - - // Early terminate for this case since Y data is not required, and Y_h is obtained correctly, no need the following code to retrive Y_h from Y data. 
- if (nullptr == Y) { + CUDNN_RETURN_IF_ERROR(cudnnGetRNNTempSpaceSizes(GetCudnnHandle(ctx), rnn_desc, CUDNN_FWD_MODE_INFERENCE, + x_desc1, &workspace_bytes, &reservespace_bytes)); + auto workspace_cuda = GetScratchBuffer(workspace_bytes, ctx->GetComputeStream()); + auto reservespace_cuda = GetScratchBuffer(reservespace_bytes, ctx->GetComputeStream()); + + CUDNN_RETURN_IF_ERROR(cudnnRNNForward(GetCudnnHandle(ctx), + rnn_desc, + CUDNN_FWD_MODE_INFERENCE, + sequence_lens_buffer.GpuPtr(), // should be zero starting with cudnn 8.9.1 + x_desc1, + x_data_input, + y_desc1, + y_data, // output + hx_desc, + hx_data, // input + y_h_data, // output + cx_desc, cx_data, y_c_data, + weight_cached_ ? w_data_cache_size_in_bytes_ : w_data_size_in_bytes, + weight_cached_ ? w_data_cache_.get() : w_data.get(), + workspace_bytes, + workspace_cuda.get(), + reservespace_bytes, + reservespace_cuda.get())); + + // Early terminate for this case since Y data is not required, and Y_h is obtained correctly, + // no need the following code to retrieve Y_h from Y data. + if (nullptr == Y) { + // Mask on output for 0 sequence batches + if (zero_seq_count > 0) { // Mask on output for 0 sequence batches - if (zero_seq_count > 0) { - SetZeroSequences(zero_seq_index_cache_size, zero_seq_index_cache, y_data, y_h_data, y_c_data, ctx->GetComputeStream()); - } - return Status::OK(); + SetZeroSequences(zero_seq_count, zero_seq_index_cache, y_data, y_h_data, y_c_data, ctx->GetComputeStream()); } + return Status::OK(); } IAllocatorUniquePtr y_reorganized_data; @@ -345,6 +370,7 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { // reverse output data ReverseBySequence(Stream(ctx), gsl::narrow_cast(seq_length), + sequence_lens_buffer.GpuPtr(), gsl::narrow_cast(batch_size), gsl::narrow_cast(hidden_size_), reinterpret_cast(y_data), @@ -361,8 +387,9 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { } if (Y != nullptr) { - // User specified this optional output, so need to copy the reversed data to orignial place - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(y_data, y_reorganized_data.get(), output_size * sizeof(T), cudaMemcpyDeviceToDevice, Stream(ctx))); + // User specified this optional output, so need to copy the reversed data to original place + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(y_data, y_reorganized_data.get(), output_size * sizeof(T), + cudaMemcpyDeviceToDevice, Stream(ctx))); } else { y_data = y_reorganized_data.get(); } @@ -370,23 +397,9 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { // Mask on output for 0 sequence batches if (zero_seq_count > 0) { - SetZeroSequences(zero_seq_index_cache_size, zero_seq_index_cache, y_data, y_h_data, y_c_data, ctx->GetComputeStream()); + SetZeroSequences(zero_seq_count, zero_seq_index_cache, y_data, y_h_data, y_c_data, ctx->GetComputeStream()); } - if ((CUDNN_RNN_RELU == rnn_mode_ || CUDNN_RNN_TANH == rnn_mode_) && sequence_lens_data != nullptr && y_h_data != nullptr && y_data != nullptr) { - CudaAsyncBuffer sequence_lens_buffer(this, batch_size); - memcpy(sequence_lens_buffer.CpuPtr(), sequence_lens_data, batch_size * sizeof(int32_t)); - ORT_RETURN_IF_ERROR(sequence_lens_buffer.CopyToGpu(ctx->GetComputeStream())); - RnnMaskImpl(Stream(ctx), - gsl::narrow_cast(num_directions_), - gsl::narrow_cast(seq_length), - gsl::narrow_cast(batch_size), - gsl::narrow_cast(hidden_size_), - sequence_lens_buffer.GpuPtr(), - reinterpret_cast(y_data), - reinterpret_cast(y_h_data), - output_size); - } return Status::OK(); } @@ -399,7 +412,8 @@ void 
CudnnRnnBase::SetZeroSequences(const int64_t zero_seq_index_cache_size, onnxruntime::Stream* ort_stream) const { typedef typename ToCudaType::MappedType CudaT; CudaAsyncBuffer zero_seq_index_cache_async_buffer(this, zero_seq_index_cache_size); - memcpy(zero_seq_index_cache_async_buffer.CpuPtr(), zero_seq_index_cache.data(), zero_seq_index_cache_size * sizeof(int32_t)); + memcpy(zero_seq_index_cache_async_buffer.CpuPtr(), zero_seq_index_cache.data(), + zero_seq_index_cache_size * sizeof(int32_t)); ORT_THROW_IF_ERROR(zero_seq_index_cache_async_buffer.CopyToGpu(ort_stream)); cudaStream_t cuda_stream = ort_stream ? static_cast(ort_stream->GetHandle()) : nullptr; MaskZeroSequences(cuda_stream, diff --git a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h index 1c9483b2afd38..0fa01d3486e99 100644 --- a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h +++ b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h @@ -38,26 +38,28 @@ class CudnnRNN { } } - Status Set(const cudnnHandle_t& cudnnHandle, int64_t hidden_size, int num_layers, + Status Set(int64_t input_size, int64_t hidden_size, int64_t proj_size, int num_layers, cudnnDropoutDescriptor_t cudnn_dropout_desc, cudnnDirectionMode_t cudnn_direction_model, - cudnnRNNMode_t rnn_mode, cudnnDataType_t dataType, const cudaDeviceProp& prop) { + cudnnRNNMode_t rnn_mode, bool has_bias, cudnnDataType_t dataType) { if (!cudnn_rnn_desc_) CUDNN_RETURN_IF_ERROR(cudnnCreateRNNDescriptor(&cudnn_rnn_desc_)); - CUDNN_RETURN_IF_ERROR(cudnnSetRNNDescriptor_v6(cudnnHandle, - cudnn_rnn_desc_, + CUDNN_RETURN_IF_ERROR(cudnnSetRNNDescriptor_v8(cudnn_rnn_desc_, + CUDNN_RNN_ALGO_STANDARD, // CUDNN_RNN_ALGO_PERSIST_STATIC, CUDNN_RNN_ALGO_PERSIST_DYNAMIC + rnn_mode, + has_bias ? CUDNN_RNN_DOUBLE_BIAS : CUDNN_RNN_NO_BIAS, + cudnn_direction_model, + CUDNN_LINEAR_INPUT, + dataType, + dataType, + dataType == CUDNN_DATA_HALF ? 
CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH, + gsl::narrow_cast(input_size), gsl::narrow_cast(hidden_size), + gsl::narrow_cast(proj_size), // projected size num_layers, cudnn_dropout_desc, - CUDNN_LINEAR_INPUT, // We can also skip the input matrix transformation - cudnn_direction_model, - rnn_mode, - CUDNN_RNN_ALGO_STANDARD, // CUDNN_RNN_ALGO_PERSIST_STATIC, CUDNN_RNN_ALGO_PERSIST_DYNAMIC - dataType)); - - if (prop.major >= 7 && dataType == CUDNN_DATA_HALF) { - cudnnSetRNNMatrixMathType(cudnn_rnn_desc_, CUDNN_TENSOR_OP_MATH); - } + // CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED works with CUDNN_RNN_PADDED_IO_ENABLED, so that it will auto fill 0 for the shorter sequences + CUDNN_RNN_PADDED_IO_ENABLED)); return Status::OK(); } @@ -119,8 +121,7 @@ class CudnnRnnBase : public CudaKernel { private: Status SetCudnnRnnWeightBias(const cudnnHandle_t cudnn_handle, const cudnnRNNDescriptor_t rnn_desc, - const cudnnTensorDescriptor_t x_desc, - const cudnnFilterDescriptor_t w_desc, + size_t w_data_size, void* w_data, const T* W_data, const T* R_data, @@ -128,23 +129,22 @@ class CudnnRnnBase : public CudaKernel { cudaStream_t cuda_stream) const; Status ReorganizeWeights(const Tensor* W, const Tensor* R, const Tensor* B, + size_t& target_w_data_size_in_bytes, IAllocatorUniquePtr& target_w_data, CudnnFilterDescriptor& target_w_desc, CudnnRNN& rnn_desc, onnxruntime::Stream* ort_stream) const; - void SetWeightBias(const cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnn_desc, - const int pseudo_layer, - const cudnnTensorDescriptor_t x_desc, - const cudnnFilterDescriptor_t w_desc, - const cudnnFilterDescriptor_t filter_desc, - const void* w_data, - const int lin_layer_id, - const T* pos, - int& offset, - bool is_matrix, - cudaStream_t cuda_stream) const; + Status SetWeightBias(const cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnn_desc, + const int pseudo_layer, + size_t w_data_size, + const void* w_data, + const int lin_layer_id, + const T* pos, + int& offset, + bool is_matrix, + cudaStream_t cuda_stream) const; void SetZeroSequences(const int64_t zero_seq_index_cache_size, const std::vector zero_seq_index_cache, @@ -167,6 +167,7 @@ class CudnnRnnBase : public CudaKernel { cudnnRNNMode_t rnn_mode_; // w_desc_cache_ & w_data_cache_ are changed in Constructor if we can get the weights as constant input CudnnFilterDescriptor w_desc_cache_; + size_t w_data_cache_size_in_bytes_; IAllocatorUniquePtr w_data_cache_; bool weight_cached_; int64_t layout_; diff --git a/onnxruntime/core/providers/cuda/rnn/rnn.cc b/onnxruntime/core/providers/cuda/rnn/rnn.cc index 4bd22340ef2bb..ed8be63679707 100644 --- a/onnxruntime/core/providers/cuda/rnn/rnn.cc +++ b/onnxruntime/core/providers/cuda/rnn/rnn.cc @@ -1,8 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/providers/shared_library/provider_api.h" #include "rnn.h" + +#include "core/providers/shared_library/provider_api.h" #include "rnn_impl.h" #include "core/providers/cuda/cudnn_common.h" diff --git a/onnxruntime/core/providers/cuda/rnn/rnn.h b/onnxruntime/core/providers/cuda/rnn/rnn.h index e4e50046b3725..6221afb003b22 100644 --- a/onnxruntime/core/providers/cuda/rnn/rnn.h +++ b/onnxruntime/core/providers/cuda/rnn/rnn.h @@ -4,6 +4,7 @@ #pragma once #include "cudnn_rnn_base.h" + #include "core/providers/cuda/cuda_common.h" #include diff --git a/onnxruntime/core/providers/cuda/rnn/rnn_impl.cu b/onnxruntime/core/providers/cuda/rnn/rnn_impl.cu index d485855ddb417..94c8036be6cdf 100644 --- a/onnxruntime/core/providers/cuda/rnn/rnn_impl.cu +++ b/onnxruntime/core/providers/cuda/rnn/rnn_impl.cu @@ -8,22 +8,32 @@ namespace onnxruntime { namespace cuda { template -__global__ void _ReverseBySequenceKernel(const int32_t seq_length, +__global__ void _ReverseBySequenceKernel(const int32_t max_seq_length, + const int32_t* seq_lengths, const int32_t block_size, const fast_divmod div_batch_block, + const fast_divmod div_input_or_hidden_size, const T* data, T* reversed_data, const CUDA_LONG N) { CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); int seq_id, offset; div_batch_block.divmod(id, seq_id, offset); - int org_id = (seq_length - seq_id - 1) * block_size + offset; - reversed_data[id] = data[org_id]; + int batch, batch_offset; + div_input_or_hidden_size.divmod(offset, batch, batch_offset); + int seq_id_org = seq_lengths[batch] - seq_id - 1; + if (seq_id_org >= 0) { + int org_id = seq_id_org * block_size + offset; + reversed_data[id] = data[org_id]; + } else { + reversed_data[id] = T{}; + } } template void ReverseBySequence(cudaStream_t stream, - const int32_t seq_length, + const int32_t max_seq_length, + const int32_t *seq_lengths, const int32_t batch_size, const int32_t input_or_hidden_size, const T* data, @@ -32,9 +42,10 @@ void ReverseBySequence(cudaStream_t stream, // kerneral int32_t block_size = batch_size * input_or_hidden_size; fast_divmod div_batch_block(block_size); + fast_divmod div_input_or_hidden_size(input_or_hidden_size); int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); _ReverseBySequenceKernel<<>>( - seq_length, block_size, div_batch_block, data, reversed_data, (CUDA_LONG)N); + max_seq_length, seq_lengths, block_size, div_batch_block, div_input_or_hidden_size, data, reversed_data, (CUDA_LONG)N); } template @@ -82,60 +93,6 @@ void ReorderBidirectionalDataInSequence(cudaStream_t stream, data, reordered_data, (CUDA_LONG)N); } -template -__global__ void _RnnMaskKernel(const int32_t seq_length, - const int32_t batch_size, - const int32_t hidden_size, - const int32_t* sequence_lens, - const fast_divmod div_seq_block, - const fast_divmod div_dir_block, - const fast_divmod div_batch_block, - T* y_output_data, - T* y_h_output_data, - const CUDA_LONG N) { - CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); - - int seq_id, direction_id, batch_id, offset; - div_seq_block.divmod(id, seq_id, offset); - div_dir_block.divmod(offset, direction_id, offset); - div_batch_block.divmod(offset, batch_id, offset); - int32_t batch_seq_length = sequence_lens[batch_id]; - - if (batch_id >= batch_size || batch_seq_length == seq_length) { - return; - } - - if (seq_id >= batch_seq_length) { - y_output_data[id] = 0; - return; - } - - if ((y_h_output_data != nullptr) && - ((direction_id == 0 && (seq_id + 1) == batch_seq_length) || (direction_id == 1 && seq_id == 0))) { - int hy_idx = 
direction_id * batch_size * hidden_size + batch_id * hidden_size + offset; - y_h_output_data[hy_idx] = y_output_data[id]; - } -} - -template -void RnnMaskImpl(cudaStream_t stream, - const int32_t num_directions, - const int32_t seq_length, - const int32_t batch_size, - const int32_t hidden_size, - const int32_t* sequence_lens, - T* y_output_data, - T* y_h_output_data, - const size_t N) { - fast_divmod div_seq_block(batch_size * hidden_size * num_directions); - fast_divmod div_dir_block(batch_size * hidden_size); - fast_divmod div_batch_block(hidden_size); - int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _RnnMaskKernel<<>>( - seq_length, batch_size, hidden_size, sequence_lens, div_seq_block, - div_dir_block, div_batch_block, y_output_data, y_h_output_data, (CUDA_LONG)N); -} - template __global__ void _MaskZeroSequences(const int32_t hidden_size, T* y_output_data, @@ -180,17 +137,9 @@ void MaskZeroSequences(cudaStream_t stream, } #define SPECIALIZED_RNN_IMPL(T) \ - template void RnnMaskImpl(cudaStream_t stream, \ - const int32_t num_directions, \ - const int32_t seq_length, \ - const int32_t batch_size, \ - const int32_t hidden_size, \ - const int32_t* sequence_lens, \ - T* y_output_data, \ - T* y_h_output_data, \ - const size_t N); \ - template void ReverseBySequence(cudaStream_t stream, \ - const int32_t seq_length, \ + template void ReverseBySequence(cudaStream_t stream, \ + const int32_t max_seq_length, \ + const int32_t* seq_lengths, \ const int32_t batch_size, \ const int32_t hidden_size, \ const T* data, \ @@ -203,7 +152,7 @@ void MaskZeroSequences(cudaStream_t stream, const T* data, \ T* reordered_data, \ const size_t N); \ -template void MaskZeroSequences(cudaStream_t stream, \ +template void MaskZeroSequences(cudaStream_t stream, \ const int32_t hidden_size, \ T* y_output_data, \ T* y_h_output_data, \ diff --git a/onnxruntime/core/providers/cuda/rnn/rnn_impl.h b/onnxruntime/core/providers/cuda/rnn/rnn_impl.h index 9844e04ff6ec5..ba876011f6b67 100644 --- a/onnxruntime/core/providers/cuda/rnn/rnn_impl.h +++ b/onnxruntime/core/providers/cuda/rnn/rnn_impl.h @@ -10,7 +10,8 @@ namespace cuda { template void ReverseBySequence(cudaStream_t stream, - const int32_t seq_length, + const int32_t max_seq_length, + const int32_t* seq_lengths, const int32_t batch_size, const int32_t input_or_hidden_size, const T* data, @@ -26,17 +27,6 @@ void ReorderBidirectionalDataInSequence(cudaStream_t stream, T* reordered_data, const size_t N); -template -void RnnMaskImpl(cudaStream_t stream, - const int32_t num_directions, - const int32_t seq_length, - const int32_t batch_size, - const int32_t hidden_size, - const int32_t* sequence_lens, - T* y_output_data, - T* y_h_output_data, - const size_t N); - template void MaskZeroSequences(cudaStream_t stream, const int32_t hidden_size, diff --git a/onnxruntime/test/providers/cpu/rnn/rnn_op_test.cc b/onnxruntime/test/providers/cpu/rnn/rnn_op_test.cc index b9875b9553a55..1a31743e2f7e7 100644 --- a/onnxruntime/test/providers/cpu/rnn/rnn_op_test.cc +++ b/onnxruntime/test/providers/cpu/rnn/rnn_op_test.cc @@ -120,15 +120,11 @@ TEST(RNNTest, RNN_bidirectional_bias_initial_zigged_batch) { test.AddOutput("Y_h", Y_h_dims, Y_h_data); // TensorRT failed on RNN tests - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } // Doesn't work with CUDA 11.4 on Windows. Need investigation. 
-#if defined(USE_CUDA) && defined(_WIN32) -TEST(RNNTest, DISABLED_RNN_bidirectional_zigged_batch) { -#else TEST(RNNTest, RNN_bidirectional_zigged_batch) { -#endif OpTester test("RNN"); int64_t num_directions = 2, input_size = 2, hidden_size = 3, seq_length = 5; @@ -275,15 +271,11 @@ TEST(RNNTest, RNN_reverse_direction_zigged_batch) { std::vector Y_h_data({0.87014002F, 0.09402763F, -0.54269236F, 0.64809889F, -0.19472955F, -0.24271242F}); test.AddOutput("Y_h", Y_h_dims, Y_h_data); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } // Doesn't work with CUDA 11.4 on Windows. Need investigation. -#if defined(USE_CUDA) && defined(_WIN32) -TEST(RNNTest, DISABLED_RNN_forward_direction_zigged_batch) { -#else TEST(RNNTest, RNN_forward_direction_zigged_batch) { -#endif OpTester test("RNN"); int64_t num_directions = 1, input_size = 2, hidden_size = 3, seq_length = 5; @@ -357,12 +349,7 @@ TEST(RNNTest, RNN_forward_direction_zigged_batch) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } -// Doesn't work with CUDA 11.4 on Windows. Need investigation. -#if defined(USE_CUDA) && defined(_WIN32) -TEST(RNNTest, DISABLED_RNN_bidirectional_0) { -#else TEST(RNNTest, RNN_bidirectional_0) { -#endif OpTester test("RNN"); int64_t num_directions = 2, input_size = 2, hidden_size = 3, batch_size = 1, seq_length = 5; @@ -424,12 +411,7 @@ TEST(RNNTest, RNN_bidirectional_0) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } -// Doesn't work with CUDA 11.4 on Windows. Need investigation. -#if defined(USE_CUDA) && defined(_WIN32) -TEST(RNNTest, DISABLED_RNN_bidirectional_1) { -#else TEST(RNNTest, RNN_bidirectional_1) { -#endif OpTester test("RNN"); int64_t num_directions = 2, input_size = 2, hidden_size = 2, batch_size = 1, seq_length = 1; @@ -597,7 +579,7 @@ TEST(RNNTest, DISABLED_RNN_default_attributes_and_forward_direction) { } } -TEST(RNNTest, DISABLED_RNN_reverse_direction) { +TEST(RNNTest, RNN_reverse_direction) { int64_t num_directions = 1, input_size = 2, hidden_size = 3, batch_size = 1, seq_length = 5; // In case of useDefault, attributes, inputs or outputs are not set. From aec2389ad0463d218b8cf3b1e245d4c34e98364a Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 23 Feb 2024 12:52:47 -0800 Subject: [PATCH 052/279] [js/webgpu] allows a ProgramInfo's RunData to use zero sized output (#19614) ### Description This PR allows zero-sized output. To make the implementation simple, it does not support partial zero-sized tensor. Which means, either all outputs are zero-sized, or an error will be reported. 
added 2 tests: - op test of `Add` with input T[2,0] T[2,1], and - test_split_zero_size_splits --- js/web/lib/wasm/jsep/backend-webgpu.ts | 32 ++++++++++++++++++++++---- js/web/lib/wasm/jsep/init.ts | 3 ++- js/web/lib/wasm/jsep/util.ts | 11 ++++++++- js/web/test/data/ops/add.jsonc | 22 ++++++++++++++++++ js/web/test/suite-test-list.jsonc | 2 +- js/web/test/test-runner.ts | 10 ++++++-- 6 files changed, 71 insertions(+), 9 deletions(-) diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index 98990a6fe477b..3e3a191ec3ead 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -385,11 +385,16 @@ export class WebGpuBackend { // create info for inputs const inputDatas: GpuData[] = []; for (let i = 0; i < inputTensorViews.length; ++i) { - const gpuData = this.gpuDataManager.get(inputTensorViews[i].data); + const data = inputTensorViews[i].data; + // if tensor view data is 0, it means the output is zero-sized tensor, and there is no GPU data for it. + if (data === 0) { + continue; + } + const gpuData = this.gpuDataManager.get(data); if (!gpuData) { - throw new Error(`no GPU data for input: ${inputTensorViews[i].data}`); + throw new Error(`no GPU data for input: ${data}`); } - inputDatas[i] = gpuData; + inputDatas.push(gpuData); } const {outputs, dispatchGroup, programUniforms} = program.getRunData(inputTensorViews); @@ -419,6 +424,11 @@ export class WebGpuBackend { const tensorView = (isTemporary || isPersistent) ? createIntermediateOutput(outputs[i].dataType, outputs[i].dims) : createKernelOutput(validatedOutputIndices[i], outputs[i].dataType, outputs[i].dims); + outputTensorViews.push(tensorView); + // if tensor view data is 0, it means the output is zero-sized tensor, and there is no GPU data for it. + if (tensorView.data === 0) { + continue; + } const gpuData = this.gpuDataManager.get(tensorView.data); if (!gpuData) { throw new Error(`no GPU data for output: ${tensorView.data}`); @@ -434,10 +444,24 @@ export class WebGpuBackend { } persistentData.push(gpuData); } - outputTensorViews.push(tensorView); outputDatas.push(gpuData); } + // when there are any zero-sized tensor in the inputs or outputs, we should report error unless all outputs are + // zero-sized tensors. + if (inputDatas.length !== inputTensorViews.length || outputDatas.length !== outputTensorViews.length) { + // if all outputs are zero-sized tensors, there is no need to run the program. + if (outputDatas.length === 0) { + TRACE_FUNC_END(program.name); + return outputTensorViews; + } + // if some outputs are zero-sized tensors, report an error. + // + // TODO: so far we don't see any use case that outputs include both zero-sized tensors and non-zero-sized tensors. + // If we see such use case, we need to make a change here to support it. + throw new Error( + `Program ${program.name} has zero-sized tensor(s) in inputs or outputs. This is not supported now.`); + } // load uniforms // TODO: add cache for uniform (is it necessary?) diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 786ae41646554..b64abf9cc5424 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -104,7 +104,8 @@ class ComputeContextImpl implements ComputeContext { throw new Error(`Unsupported data type: ${dataType}`); } const bufferSize = elementSize * ShapeUtil.size(dims); - return new TensorViewImpl(this.module, dataType, this.backend.gpuDataManager.create(bufferSize).id, dims); + const gpuDataId = bufferSize > 0 ? 
this.backend.gpuDataManager.create(bufferSize).id : 0; + return new TensorViewImpl(this.module, dataType, gpuDataId, dims); }; return this.backend.run(program, mappedInputs, outputIndices, createKernelOutput, createTemporaryOutput); } diff --git a/js/web/lib/wasm/jsep/util.ts b/js/web/lib/wasm/jsep/util.ts index c0517ce363644..9a1d5463f7843 100644 --- a/js/web/lib/wasm/jsep/util.ts +++ b/js/web/lib/wasm/jsep/util.ts @@ -56,7 +56,16 @@ export class BroadcastUtil { if (aLen !== bLen && aLen > 1 && bLen > 1) { return undefined; } - cdims[crank - i] = Math.max(aLen, bLen); + const max = Math.max(aLen, bLen); + if (aLen && bLen) { + cdims[crank - i] = Math.max(aLen, bLen); + } else { + // when either aLen or bLen is 0, the other should be either 0 or 1, otherwise it is not broadcastable. + if (max > 1) { + return undefined; + } + cdims[crank - i] = 0; + } } return cdims; diff --git a/js/web/test/data/ops/add.jsonc b/js/web/test/data/ops/add.jsonc index e5b4ff2b53148..dd15134861ef0 100644 --- a/js/web/test/data/ops/add.jsonc +++ b/js/web/test/data/ops/add.jsonc @@ -157,6 +157,28 @@ "type": "float32" } ] + }, + { + "name": "T[2,0] T[2,1]", + "inputs": [ + { + "data": [], + "dims": [2, 0], + "type": "float32" + }, + { + "data": [1, 2], + "dims": [2, 1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [], + "dims": [2, 0], + "type": "float32" + } + ] } ] } diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index b43b1ac37e37d..88555a27be82e 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -1231,7 +1231,7 @@ "test_split_variable_parts_1d", "test_split_variable_parts_2d", "test_split_variable_parts_default_axis", - // // "test_split_zero_size_splits", + "test_split_zero_size_splits", "test_sqrt_example", "test_sqrt", "test_squeeze_negative_axes", diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index a4adf5c4ce144..7c03e5b915fd7 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -573,7 +573,9 @@ export async function sessionRun(options: { // replace the CPU tensors in feeds into GPU tensors for (const name in feeds) { if (Object.hasOwnProperty.call(feeds, name)) { - feeds[name] = createGpuTensorForInput(feeds[name]); + if (feeds[name].size > 0) { + feeds[name] = createGpuTensorForInput(feeds[name]); + } } } } @@ -582,7 +584,11 @@ export async function sessionRun(options: { for (const name in options.outputsMetaInfo) { if (Object.hasOwnProperty.call(options.outputsMetaInfo, name)) { const {type, dims} = options.outputsMetaInfo[name]; - fetches[name] = createGpuTensorForOutput(type, dims); + if (dims.some(d => d === 0)) { + fetches[name] = new ort.Tensor(type, [], dims); + } else { + fetches[name] = createGpuTensorForOutput(type, dims); + } } } } From bb43a0f1338b05e93fcbbe5c5cb53ebf017625ba Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Fri, 23 Feb 2024 15:45:30 -0800 Subject: [PATCH 053/279] [js/webgpu] minor fixes to make tinyllama work (#19564) --- js/web/lib/wasm/jsep/webgpu/ops/concat.ts | 4 +++- js/web/lib/wasm/jsep/webgpu/ops/gather.ts | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts index b06c9fb496d15..b142a82e551a7 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts @@ -154,7 +154,9 @@ const createConcatProgramInfo = (inputs: readonly TensorView[], axis: number): P export const concat = (context: 
ComputeContext, attributes: ConcatAttributes): void => { validateInputs(context.inputs); - context.compute(createConcatProgramInfo(context.inputs, attributes.axis)); + // 0 length tensors are valid for concat, remove them + const nonEmptyInputs = context.inputs.filter(input => ShapeUtil.size(input.dims) > 0); + context.compute(createConcatProgramInfo(nonEmptyInputs, attributes.axis), {inputs: nonEmptyInputs}); }; export const parseConcatAttributes = (attributes: Record): ConcatAttributes => diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts index 5c31e6dd86c00..d48bb909f7f8f 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts @@ -55,7 +55,7 @@ const createGatherProgramInfo = (inputs: readonly TensorView[], attributes: Gath if (idx${x} < 0) { idx${x} = idx${x} + uniforms.axisDimLimit; } - var dataIndices${x} = ${data.type.indices}(0); + var dataIndices${x} : ${data.type.indices}; `; for (let i = 0, j = 0; i < inputRank; i++) { if (i === axis) { From 46c4d7fe4ad457d517fe92db7681c38849c51beb Mon Sep 17 00:00:00 2001 From: Sheil Kumar Date: Fri, 23 Feb 2024 18:20:22 -0800 Subject: [PATCH 054/279] Disable gemm activation for non-float data types (#19612) ### Description Disable gemm activation for non-float data types ### Motivation and Context When a float16 model contains a Gemm+Relu subgraph, the gemm_activation_fusion will kick in and cause the two nodes to be eliminated and replaced with a FusedGemm. This however is only registered for the float data type. This causes model load failures. Disable the fusion for non-float data types. --------- Co-authored-by: Sheil Kumar --- onnxruntime/core/optimizer/gemm_activation_fusion.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/onnxruntime/core/optimizer/gemm_activation_fusion.cc b/onnxruntime/core/optimizer/gemm_activation_fusion.cc index c62887da09fdc..50be2cbd48f7b 100644 --- a/onnxruntime/core/optimizer/gemm_activation_fusion.cc +++ b/onnxruntime/core/optimizer/gemm_activation_fusion.cc @@ -56,6 +56,13 @@ Status GemmActivationFusion::ApplyImpl(Graph& graph, bool& modified, int graph_l continue; } + NodeArg* node_output = node.MutableOutputDefs()[0]; + auto data_type = node_output->TypeAsProto()->tensor_type().elem_type(); + if (data_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT) { + // FusedGemm is only registered for float data type in fused_gemm.cc! + continue; + } + const Node& next_node = *(node.OutputNodesBegin()); if (!IsFusableActivation(next_node) || next_node.GetExecutionProviderType() != node.GetExecutionProviderType()) { continue; From c12a20bef95df5437189687b94e7ba2f1bad1505 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Sat, 24 Feb 2024 14:06:30 +1000 Subject: [PATCH 055/279] Add helper to run CIs for a branch using `az pipelines`. (#16843) ### Description Add helper to run CIs for a branch using `az pipelines`. This can be used to easily kick off multiple CIs for a branch prior to creating a PR. Update run_CIs_for_external_pr.py so the CI list can be shared. Request json output from `gh pr view` so the current state is more easily parsed. 
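For context, a minimal sketch (not the helper itself, which is added in the diff below) of the two CLI interactions this change builds on: reading PR state as JSON from `gh`, and queueing a named pipeline for a branch with `az pipelines`. The PR number and branch name here are placeholders.

```
import json
import subprocess

# Read the PR state as JSON instead of parsing tab-separated text.
pr_view = subprocess.run(["gh", "pr", "view", "--json", "state", "12345"],
                         capture_output=True, text=True, check=True)
state = json.loads(pr_view.stdout)["state"]  # e.g. "OPEN"

# Queue one named pipeline against a branch; assumes the `az devops configure`
# defaults for organization/project have been set as described in the helper.
az_out = subprocess.run(["az", "pipelines", "run", "--branch", "my/BranchName",
                         "--name", "Linux CPU CI Pipeline"],
                        capture_output=True, text=True, check=True)
build_id = json.loads(az_out.stdout)["id"]
print(f"https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId={build_id}")
```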
### Motivation and Context --- tools/python/run_CIs_for_branch.py | 116 +++++++++++++++++++++++ tools/python/run_CIs_for_external_pr.py | 120 +++++++++++++----------- 2 files changed, 181 insertions(+), 55 deletions(-) create mode 100644 tools/python/run_CIs_for_branch.py diff --git a/tools/python/run_CIs_for_branch.py b/tools/python/run_CIs_for_branch.py new file mode 100644 index 0000000000000..c507cae0d9f43 --- /dev/null +++ b/tools/python/run_CIs_for_branch.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import argparse +import json +import os +import subprocess +import sys +import typing + +from run_CIs_for_external_pr import get_pipeline_names +from util.platform_helpers import is_windows + + +def _parse_args(): + parser = argparse.ArgumentParser( + os.path.basename(__file__), + formatter_class=argparse.RawDescriptionHelpFormatter, + description="""Run the CIs used to validate PRs for the specified branch. + + If specified, the `--include` filter is applied first, followed by any `--exclude` filter. + + Requires the Azure CLI with DevOps extension to be installed. + Azure CLI: https://learn.microsoft.com/en-us/cli/azure/install-azure-cli + DevOps extension: https://github.com/Azure/azure-devops-cli-extension + + Configuration: + Login:`az login` + Configure ORT repo as default: + `az devops configure --defaults organization=https://dev.azure.com/onnxruntime project=onnxruntime` + + Example usage: + List all CIs + `python run_CIs_for_branch.py --dry-run my/BranchName` + Run all CIs + `python run_CIs_for_branch.py my/BranchName` + Run only Linux CIs + `python run_CIs_for_branch.py --include linux my/BranchName` + Exclude training CIs + `python run_CIs_for_branch.py --exclude training my/BranchName` + Run non-training Linux CIs + `python run_CIs_for_branch.py --include linux --exclude training my/BranchName` + """, + ) + + parser.add_argument("-i", "--include", type=str, help="Include CIs that match this string. Case insensitive.") + parser.add_argument("-e", "--exclude", type=str, help="Exclude CIs that match this string. 
Case insensitive.") + parser.add_argument("--dry-run", action="store_true", help="Print selected CIs but do not run them.") + parser.add_argument("branch", type=str, help="Specify the branch to run.") + + args = parser.parse_args() + return args + + +def _run_az_pipelines_command(command: typing.List[str]): + try: + az = "az.cmd" if is_windows() else "az" + az_output = subprocess.run([az, "pipelines", *command], capture_output=True, text=True, check=True) + except subprocess.CalledProcessError as cpe: + print(cpe) + print(cpe.stderr) + sys.exit(-1) + + return az_output + + +def main(): + args = _parse_args() + branch = args.branch + + # To debug available pipelines: + # az_out = az_pipelines = _run_az_pipelines_command(["list"]) + # pipeline_info = json.loads(az_out.stdout) + # print(pipeline_info) + + pipelines = get_pipeline_names() + pipelines_to_run = [] + if args.include: + value = args.include.lower().strip() + for p in pipelines: + if value in p.lower(): + print(f"Including {p}") + pipelines_to_run.append(p) + else: + pipelines_to_run = pipelines + + if args.exclude: + value = args.exclude.lower().strip() + cur_pipelines = pipelines_to_run + pipelines_to_run = [] + for p in cur_pipelines: + if value in p.lower(): + print(f"Excluding {p}") + else: + pipelines_to_run.append(p) + + print("Pipelines to run:") + for p in pipelines_to_run: + print(f"\t{p}") + + if args.dry_run: + sys.exit(0) + + for pipeline in pipelines_to_run: + az_out = _run_az_pipelines_command(["run", "--branch", branch, "--name", pipeline]) + run_output = json.loads(az_out.stdout) + if "id" in run_output: + build_url = f"https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId={run_output['id']}" + print(f"{pipeline} build results: {build_url}&view=results") + else: + raise ValueError("Build id was not found in az output:\n" + run_output) + + +if __name__ == "__main__": + main() diff --git a/tools/python/run_CIs_for_external_pr.py b/tools/python/run_CIs_for_external_pr.py index df4e70b1e51fe..dcafe898b3bdf 100644 --- a/tools/python/run_CIs_for_external_pr.py +++ b/tools/python/run_CIs_for_external_pr.py @@ -3,13 +3,54 @@ # Licensed under the MIT License. import argparse +import json import os import subprocess import sys import typing -def parse_args(): +def get_pipeline_names(): + # Current pipelines. These change semi-frequently and may need updating. + # There is no easy way to get the list of "required" pipelines using `azp` before they are run, + # so we need to maintain this list manually. 
+ # NOTE: This list is also used by run_CIs_for_branch.py + pipelines = [ + # windows + "Windows ARM64 QNN CI Pipeline", + "Windows x64 QNN CI Pipeline", + "Windows CPU CI Pipeline", + "Windows GPU CI Pipeline", + "Windows GPU TensorRT CI Pipeline", + "ONNX Runtime Web CI Pipeline", + # linux + "Linux CPU CI Pipeline", + "Linux CPU Minimal Build E2E CI Pipeline", + "Linux GPU CI Pipeline", + "Linux GPU TensorRT CI Pipeline", + "Linux OpenVINO CI Pipeline", + "Linux QNN CI Pipeline", + # mac + "MacOS CI Pipeline", + # training + "orttraining-amd-gpu-ci-pipeline", + "orttraining-linux-ci-pipeline", + "orttraining-linux-gpu-ci-pipeline", + "orttraining-ortmodule-distributed", + # checks + "onnxruntime-binary-size-checks-ci-pipeline", + # big models + "Big Models", + # not currently required, but running ensures we're hitting all mobile platforms + "Android CI Pipeline", + "iOS CI Pipeline", + "ONNX Runtime React Native CI Pipeline", + ] + + return pipelines + + +def _parse_args(): parser = argparse.ArgumentParser( os.path.basename(__file__), formatter_class=argparse.RawDescriptionHelpFormatter, @@ -25,7 +66,7 @@ def parse_args(): return args -def run_gh_pr_command(command: typing.List[str], check=True): +def run_gh_pr_command(command: typing.List[str], check: bool = True): try: return subprocess.run(["gh", "pr", *command], capture_output=True, text=True, check=check) except subprocess.CalledProcessError as cpe: @@ -35,23 +76,25 @@ def run_gh_pr_command(command: typing.List[str], check=True): def main(): - args = parse_args() + args = _parse_args() pr_id = args.pr # validate PR - gh_out = run_gh_pr_command(["view", pr_id]) - info = gh_out.stdout.split("\n") - for line in info: - pieces = line.split("\t") - if len(pieces) != 2: - continue - - if pieces[0] == "state:": - if pieces[1] != "OPEN": - print(f"PR {pr_id} is not OPEN. Currently in state {pieces[1]}.") - sys.exit(-1) - - print("Check passed pipelines") + print("Checking PR is open") + gh_out = run_gh_pr_command(["view", "--json", "state", pr_id]) + info = json.loads(gh_out.stdout) + if "state" not in info: + print(f"Could not get current state from `gh pr view` response of\n{gh_out.stdout}") + sys.exit(-1) + + if info["state"] != "OPEN": + print(f"PR {pr_id} is not OPEN. Currently in state {info['state']}.") + sys.exit(0) + + # This will return CIs that have run previously but not passed. We filter the CIs to run based on this, so it's + # fine for the initial response to have no info in it. + # `gh pr checks` exits with non-zero exit code when failures in pipeline exist, so we set `check` to False. + print("Checking for pipelines that have passed.") gh_out = run_gh_pr_command(["checks", pr_id, "--required"], check=False) # output format is a tab separated list of columns: # (pipeline name) "\t" (status) "\t" (ran time) "\t" (url) @@ -61,54 +104,21 @@ def main(): if len(columns) == 4 and columns[1] == "pass" ] - print("Adding azp run commands") - - # Current pipelines. These change semi-frequently and may need updating. - # - # Note: there is no easy way to get the list for azp "required" pipelines before they starts. - # we need to maintain this list manually. 
- # - pipelines = [ - # windows - "Windows ARM64 QNN CI Pipeline", - "Windows x64 QNN CI Pipeline", - "Windows CPU CI Pipeline", - "Windows GPU CI Pipeline", - "Windows GPU TensorRT CI Pipeline", - "ONNX Runtime Web CI Pipeline", - # linux - "Linux CPU CI Pipeline", - "Linux CPU Minimal Build E2E CI Pipeline", - "Linux GPU CI Pipeline", - "Linux GPU TensorRT CI Pipeline", - "Linux OpenVINO CI Pipeline", - "Linux QNN CI Pipeline", - # mac - "MacOS CI Pipeline", - # training - "orttraining-amd-gpu-ci-pipeline", - "orttraining-linux-ci-pipeline", - "orttraining-linux-gpu-ci-pipeline", - "orttraining-ortmodule-distributed", - # checks - "onnxruntime-python-checks-ci-pipeline", - "onnxruntime-binary-size-checks-ci-pipeline", - # big models - "Big Models", - # not currently required, but running ensures we're hitting all mobile platforms - "Android CI Pipeline", - "iOS CI Pipeline", - "ONNX Runtime React Native CI Pipeline", - ] + pipelines = get_pipeline_names() # remove pipelines that have already run successfully pipelines = [p for p in pipelines if p not in checked_pipelines] + print("Pipelines to run:") + for p in pipelines: + print("\t" + p) + # azp run is limited to 10 pipelines at a time max_pipelines_per_comment = 10 start = 0 num_pipelines = len(pipelines) + print("Adding azp run commands") while start < num_pipelines: end = start + max_pipelines_per_comment if end > num_pipelines: From 9ccdc4961ad76355289ed3a36ccb8307e8dc7789 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Fri, 23 Feb 2024 22:31:57 -0800 Subject: [PATCH 056/279] Stop using apiset in OneCore build: use onecoreuap.lib instead of onecoreuap_apiset.lib (#19632) ### Description Stop using apiset in OneCore build: use onecoreuap.lib instead of onecoreuap_apiset.lib in onecore build. ### Motivation and Context 1. Now all Windows Editions come with Reverse Forwarders. We should just use the normal onecore libs. 2. Many new Windows APIs are only available in [windows umbrella libraries](https://learn.microsoft.com/en-us/windows/win32/apiindex/windows-umbrella-libraries). So these libraries are not specific for Windows CoreOS or Onecore. 3. Going forward we should use "IsApiSetImplemented" to guard our API usages: https://learn.microsoft.com/en-us/windows/win32/apiindex/detect-api-set-availability . After this change, our built binaries can pass apivalidator's check. ``` C:\local\apivalidator>apivalidator.exe -BinaryPath:C:\src\onnxruntime\b\Debug\Debug\onnxruntime.dll -SupportedApiXmlFiles:onecoreuap_DDIs.xml ApiValidation: Summary: "C:\src\onnxruntime\b\Debug\Debug\onnxruntime.dll" is Universal ApiValidation: All binaries are Universal ``` So it will give an easy way to test ONNX Runtime's compatibility to Windows versions. --- cmake/CMakeLists.txt | 6 ++---- cmake/wcos_rules_override.cmake | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index c9be4aa65d0cc..ed9043f2adc4a 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1729,14 +1729,12 @@ if(onnxruntime_BUILD_KERNEL_EXPLORER) endif() # When GDK_PLATFORM is set then WINAPI_FAMILY is defined in gdk_toolchain.cmake (along with other relevant flags/definitions). -if (WIN32 AND NOT GDK_PLATFORM) +if (WIN32 AND NOT GDK_PLATFORM AND NOT CMAKE_CROSSCOMPILING) if (NOT CMAKE_CXX_STANDARD_LIBRARIES MATCHES kernel32.lib) # On onecore, link to the onecore build of the MSVC runtime get_filename_component(msvc_path "${CMAKE_C_COMPILER}/../../../.." 
ABSOLUTE) link_directories(BEFORE "${msvc_path}/lib/onecore/${onnxruntime_target_platform}") - # The .lib files in the MSVC runtime have a DEFAULITLIB entry for onecore.lib, which in turn links to reverse forwarders. - # We ignore that entry and use onecore_apiset.lib instead, since system components must not rely on reverse forwarders. - add_link_options("/NODEFAULTLIB:onecore.lib") + # The .lib files in the MSVC runtime have a DEFAULITLIB entry for onecore.lib, but it shold not cause any conflict with onecoreuap.lib endif() endif() diff --git a/cmake/wcos_rules_override.cmake b/cmake/wcos_rules_override.cmake index f3d8093629a42..ec2303b073d5e 100644 --- a/cmake/wcos_rules_override.cmake +++ b/cmake/wcos_rules_override.cmake @@ -1,2 +1,2 @@ -set(CMAKE_C_STANDARD_LIBRARIES_INIT onecoreuap_apiset.lib) -set(CMAKE_CXX_STANDARD_LIBRARIES_INIT onecoreuap_apiset.lib) +set(CMAKE_C_STANDARD_LIBRARIES_INIT onecoreuap.lib) +set(CMAKE_CXX_STANDARD_LIBRARIES_INIT onecoreuap.lib) From 0edb03580823c9d9e97ba1a6ea941fcd70a2500b Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Sat, 24 Feb 2024 10:09:07 -0800 Subject: [PATCH 057/279] [js/web] fix suite test list for zero sized tensor (#19638) ### Description Fixes build break brought by #19614 Currently WebGL backend does not support zero sized tensor. This change split test data into 2 parts, and only enable zero sized tensor tests for WebGPU. --- js/web/test/data/ops/add.jsonc | 22 - js/web/test/data/ops/add_zero-sized.jsonc | 31 + js/web/test/data/ops/concat_zero-sized.jsonc | 561 +++++++++++++++++++ js/web/test/suite-test-list.jsonc | 2 + 4 files changed, 594 insertions(+), 22 deletions(-) create mode 100644 js/web/test/data/ops/add_zero-sized.jsonc create mode 100644 js/web/test/data/ops/concat_zero-sized.jsonc diff --git a/js/web/test/data/ops/add.jsonc b/js/web/test/data/ops/add.jsonc index dd15134861ef0..e5b4ff2b53148 100644 --- a/js/web/test/data/ops/add.jsonc +++ b/js/web/test/data/ops/add.jsonc @@ -157,28 +157,6 @@ "type": "float32" } ] - }, - { - "name": "T[2,0] T[2,1]", - "inputs": [ - { - "data": [], - "dims": [2, 0], - "type": "float32" - }, - { - "data": [1, 2], - "dims": [2, 1], - "type": "float32" - } - ], - "outputs": [ - { - "data": [], - "dims": [2, 0], - "type": "float32" - } - ] } ] } diff --git a/js/web/test/data/ops/add_zero-sized.jsonc b/js/web/test/data/ops/add_zero-sized.jsonc new file mode 100644 index 0000000000000..37e08cd7f20ac --- /dev/null +++ b/js/web/test/data/ops/add_zero-sized.jsonc @@ -0,0 +1,31 @@ +[ + { + "name": "Add with no attributes", + "operator": "Add", + "attributes": [], + "cases": [ + { + "name": "T[2,0] T[2,1]", + "inputs": [ + { + "data": [], + "dims": [2, 0], + "type": "float32" + }, + { + "data": [1, 2], + "dims": [2, 1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [], + "dims": [2, 0], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/data/ops/concat_zero-sized.jsonc b/js/web/test/data/ops/concat_zero-sized.jsonc new file mode 100644 index 0000000000000..7be8e8c1cc602 --- /dev/null +++ b/js/web/test/data/ops/concat_zero-sized.jsonc @@ -0,0 +1,561 @@ +[ + { + "name": "Concat 2D axis=0", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": -2, "type": "int" }], + "cases": [ + { + "name": "X", + "inputs": [ + { + "data": [], + "dims": [1, 4, 0, 64], + "type": "float32" + }, + { + "data": [ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + ], + "dims": [1, 4, 36, 64], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + ], + "dims": [1, 4, 36, 64], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 88555a27be82e..e96a0aa045bc8 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -1334,6 +1334,7 @@ "acos.jsonc", "add.jsonc", "add_int32.jsonc", + "add_zero-sized.jsonc", //"and.jsonc", "asin.jsonc", "attention.jsonc", @@ -1343,6 +1344,7 @@ "ceil.jsonc", "concat.jsonc", "concat_int32.jsonc", + "concat_zero-sized.jsonc", "cast.jsonc", "conv.jsonc", "cos.jsonc", 
From c980149c857facc2463668a11944af3c6c12365b Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Sun, 25 Feb 2024 05:00:53 +0800 Subject: [PATCH 058/279] Add log for random exception in Linux GPU Test Stage. (#19569) ### Description 1. Check GPU status in Docker. 2. Use stages so that the test stage can leverage existing build artifacts. ### Motivation and Context To investigate the root cause of the random exception `CUDA failure 100: no CUDA-capable device is detected` --- .../azure-pipelines/linux-gpu-ci-pipeline.yml | 351 ++++++++++-------- 1 file changed, 198 insertions(+), 153 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index 24319184dd0b8..822bc559d992d 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -34,6 +34,17 @@ parameters: values: - 11.8 - 12.2 + + - name: SpecificArtifact + displayName: Use Specific Artifact + type: boolean + default: false + + - name: BuildId + displayName: Specific Artifact's BuildId + type: string + default: '0' + resources: repositories: - repository: manylinux @@ -61,163 +72,197 @@ variables: ${{ if eq(parameters.CudaVersion, '12.2') }}: value: 'onnxruntimecuda12build' -jobs: -- job: Linux_Build - timeoutInMinutes: 120 - variables: - skipComponentGovernanceDetection: true - CCACHE_DIR: $(Pipeline.Workspace)/ccache - workspace: - clean: all - pool: onnxruntime-Ubuntu2204-AMD-CPU - - steps: - - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 - displayName: 'Clean Agent Directories' - condition: always() - - - checkout: self - clean: true - submodules: none - - - template: templates/get-docker-image-steps.yml - parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda - Context: tools/ci_build/github/linux/docker - DockerBuildArgs: " - --network=host - --build-arg BASEIMAGE=$(docker_base_image) - --build-arg TRT_VERSION=$(linux_trt_version) - --build-arg BUILD_UID=$( id -u ) - " - Repository: $(Repository) - - - task: Cache@2 - inputs: - key: '"ccache" | "${{parameters.CudaVersion}}" |"$(Build.SourceBranch)" | "$(Build.SourceVersion)"' - path: $(CCACHE_DIR) - restoreKeys: | - "ccache" | "${{parameters.CudaVersion}}" | "$(Build.SourceBranch)" - "ccache" - cacheHitVar: CACHE_RESTORED - displayName: Cach Task - - - script: | - sudo mkdir -p $(Pipeline.Workspace)/ccache - condition: ne(variables.CACHE_RESTORED, 'true') - displayName: Create Cache Dir - - - script: | - set -e -x - mkdir -p $HOME/.onnx - docker run -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" --rm \ - --volume /data/onnx:/data/onnx:ro \ - --volume $(Build.SourcesDirectory):/onnxruntime_src \ - --volume $(Build.BinariesDirectory):/build \ - --volume /data/models:/build/models:ro \ - --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ - --volume $(Pipeline.Workspace)/ccache:/cache \ - -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ - -e NIGHTLY_BUILD \ - -e BUILD_BUILDNUMBER \ - -e CCACHE_DIR=/cache \ - $(Repository) \ - /bin/bash -c " - set -ex; \ - env; \ - ccache -s; \ - /opt/python/cp38-cp38/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ - --build_dir /build --cmake_generator Ninja \ -
--config Release --update --build \ - --skip_submodule_sync \ - --build_shared_lib \ - --parallel --use_binskim_compliant_compile_flags \ - --build_wheel \ - --enable_onnx_tests --use_cuda --cuda_version=${{parameters.CudaVersion}} --cuda_home=/usr/local/cuda-${{parameters.CudaVersion}} --cudnn_home=/usr/local/cuda-${{parameters.CudaVersion}} \ - --enable_cuda_profiling --enable_cuda_nhwc_ops \ - --enable_pybind --build_java \ - --use_cache \ - --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86; \ - ccache -sv; \ - ccache -z" - workingDirectory: $(Build.SourcesDirectory) - displayName: Build Onnxruntime - - - task: CmdLine@2 - inputs: - script: | - rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11 - rm -f $(Build.BinariesDirectory)/Release/models - find $(Build.BinariesDirectory)/Release/_deps -mindepth 1 ! -regex '^$(Build.BinariesDirectory)/Release/_deps/onnx-src\(/.*\)?' -delete - cd $(Build.BinariesDirectory)/Release - find -executable -type f > $(Build.BinariesDirectory)/Release/perms.txt - - - task: PublishPipelineArtifact@0 - displayName: 'Publish Pipeline Artifact' - inputs: - artifactName: 'drop-linux' - targetPath: '$(Build.BinariesDirectory)/Release' - - - template: templates/explicitly-defined-final-tasks.yml - -- job: Linux_Test - timeoutInMinutes: 180 - variables: - skipComponentGovernanceDetection: true - workspace: - clean: all - pool: onnxruntime-Linux-GPU-A10 - dependsOn: - - Linux_Build - steps: - - task: DownloadPipelineArtifact@2 - displayName: 'Download Pipeline Artifact' - inputs: - buildType: 'current' - artifactName: 'drop-linux' - targetPath: '$(Build.BinariesDirectory)/Release' - - - checkout: self - clean: true - submodules: none - - - template: templates/get-docker-image-steps.yml - parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda - Context: tools/ci_build/github/linux/docker - DockerBuildArgs: " - --network=host - --build-arg BASEIMAGE=$(docker_base_image) - --build-arg TRT_VERSION=$(linux_trt_version) - --build-arg BUILD_UID=$( id -u ) - " - Repository: $(Repository) - - - task: CmdLine@2 - inputs: - script: | +stages: +- stage: Linux_Build + jobs: + - job: Linux_Build + timeoutInMinutes: 120 + variables: + skipComponentGovernanceDetection: true + CCACHE_DIR: $(Pipeline.Workspace)/ccache + workspace: + clean: all + pool: onnxruntime-Ubuntu2204-AMD-CPU + + steps: + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + + - checkout: self + clean: true + submodules: none + + - template: templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda + Context: tools/ci_build/github/linux/docker + DockerBuildArgs: " + --network=host + --build-arg BASEIMAGE=$(docker_base_image) + --build-arg TRT_VERSION=$(linux_trt_version) + --build-arg BUILD_UID=$( id -u ) + " + Repository: $(Repository) + + - task: Cache@2 + inputs: + key: '"ccache" | "${{parameters.CudaVersion}}" |"$(Build.SourceBranch)" | "$(Build.SourceVersion)"' + path: $(CCACHE_DIR) + restoreKeys: | + "ccache" | "${{parameters.CudaVersion}}" | "$(Build.SourceBranch)" + "ccache" + cacheHitVar: CACHE_RESTORED + displayName: Cach Task + + - script: | + sudo mkdir -p $(Pipeline.Workspace)/ccache + condition: ne(variables.CACHE_RESTORED, 'true') + displayName: Create Cache Dir + + - script: | set -e -x mkdir -p $HOME/.onnx - docker run --gpus all --rm \ - --volume 
$(Build.SourcesDirectory):/onnxruntime_src \ - --volume $(Build.BinariesDirectory)/Release:/build/Release \ + docker run -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" --rm \ + --volume /data/onnx:/data/onnx:ro \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ --volume /data/models:/build/models:ro \ --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ - --volume /data/onnx:/data/onnx \ - -e NVIDIA_TF32_OVERRIDE=0 \ + --volume $(Pipeline.Workspace)/ccache:/cache \ + -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + -e CCACHE_DIR=/cache \ $(Repository) \ /bin/bash -c " set -ex; \ - cp /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt /tmp/requirements.txt; \ - ln -s /opt/python/cp38-cp38/bin/python3 /tmp/python3; \ - /tmp/python3 -m pip install -r /tmp/requirements.txt; \ - /tmp/python3 -m pip install /build/Release/dist/*.whl; \ - cd /build/Release && xargs -a /build/Release/perms.txt chmod a+x; \ - cd /onnxruntime_src/java && /onnxruntime_src/java/gradlew cmakeCheck -DcmakeBuildDir=/build/Release -DUSE_CUDA=1; \ - cd /tmp; \ - /tmp/python3 /onnxruntime_src/tools/ci_build/build.py \ - --build_dir /build --config Release --test --skip_submodule_sync --build_shared_lib --parallel --use_binskim_compliant_compile_flags --build_wheel --enable_onnx_tests \ - --use_cuda --cuda_version=${{parameters.CudaVersion}} --cuda_home=/usr/local/cuda --cudnn_home=/usr/local/cuda \ - --enable_pybind --build_java --ctest_path '' " - - - template: templates/clean-agent-build-directory-step.yml + env; \ + ccache -s; \ + /opt/python/cp38-cp38/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ + --build_dir /build --cmake_generator Ninja \ + --config Release --update --build \ + --skip_submodule_sync \ + --build_shared_lib \ + --parallel --use_binskim_compliant_compile_flags \ + --build_wheel \ + --enable_onnx_tests --use_cuda --cuda_version=${{parameters.CudaVersion}} --cuda_home=/usr/local/cuda-${{parameters.CudaVersion}} --cudnn_home=/usr/local/cuda-${{parameters.CudaVersion}} \ + --enable_cuda_profiling --enable_cuda_nhwc_ops \ + --enable_pybind --build_java \ + --use_cache \ + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86; \ + ccache -sv; \ + ccache -z" + workingDirectory: $(Build.SourcesDirectory) + displayName: Build Onnxruntime + + - task: CmdLine@2 + inputs: + script: | + rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11 + rm -f $(Build.BinariesDirectory)/Release/models + find $(Build.BinariesDirectory)/Release/_deps -mindepth 1 ! -regex '^$(Build.BinariesDirectory)/Release/_deps/onnx-src\(/.*\)?' 
-delete + cd $(Build.BinariesDirectory)/Release + find -executable -type f > $(Build.BinariesDirectory)/Release/perms.txt + + - task: PublishPipelineArtifact@0 + displayName: 'Publish Pipeline Artifact' + inputs: + artifactName: 'drop-linux' + targetPath: '$(Build.BinariesDirectory)/Release' + + - template: templates/explicitly-defined-final-tasks.yml + +- stage: Linux_Test + dependsOn: + - Linux_Build + jobs: + - job: Linux_Test + timeoutInMinutes: 180 + variables: + skipComponentGovernanceDetection: true + workspace: + clean: all + pool: onnxruntime-Linux-GPU-A10 + steps: + - checkout: self + clean: true + submodules: none + + - template: templates/flex-downloadPipelineArtifact.yml + parameters: + ArtifactName: 'drop-linux' + StepName: 'Download Pipeline Artifact - Linux Build' + TargetPath: '$(Build.BinariesDirectory)/Release' + SpecificArtifact: ${{ parameters.SpecificArtifact }} + BuildId: ${{ parameters.BuildId }} + + - template: templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda + Context: tools/ci_build/github/linux/docker + DockerBuildArgs: " + --network=host + --build-arg BASEIMAGE=$(docker_base_image) + --build-arg TRT_VERSION=$(linux_trt_version) + --build-arg BUILD_UID=$( id -u ) + " + Repository: $(Repository) + + - task: CmdLine@2 + inputs: + script: | + set -e -x + mkdir -p $HOME/.onnx + docker run --gpus all --rm \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory)/Release:/build/Release \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + --volume /data/onnx:/data/onnx \ + -e NVIDIA_TF32_OVERRIDE=0 \ + $(Repository) \ + /bin/bash -c ' + nvidia-smi; \ + /sbin/ldconfig -N -v $(sed "s/:/ /" <<< $LD_LIBRARY_PATH) 2>/dev/null | grep -E "libcudart.so|libcudnn.so|libnvinfer.so"; \ + cat /usr/local/cuda/include/cuda.h | grep -m1 CUDA_VERSION; \ + cat /usr/include/cudnn_version.h | grep CUDNN_MAJOR -m1 -A 2; \ + ln -s /opt/python/cp38-cp38/bin/python3 /tmp/python3; \ + /tmp/python3 -m pip install /build/Release/dist/*.whl; \ + /tmp/python3 -u -c "from onnxruntime.capi._pybind_state import (OrtDevice as C_OrtDevice) ; \ + ort_device = C_OrtDevice(C_OrtDevice.cuda(), C_OrtDevice.default_memory(), 0); \ + print(ort_device); print(ort_device.device_type(), C_OrtDevice.cuda()); \ + assert(ort_device.device_type()==1); assert(C_OrtDevice.cuda()==1);" \ + ' + displayName: 'Check GPU' + + - task: CmdLine@2 + inputs: + script: | + set -e -x + mkdir -p $HOME/.onnx + docker run --gpus all --rm \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory)/Release:/build/Release \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + --volume /data/onnx:/data/onnx \ + -e NVIDIA_TF32_OVERRIDE=0 \ + $(Repository) \ + /bin/bash -c ' + set -ex; \ + cp /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt /tmp/requirements.txt; \ + ln -s /opt/python/cp38-cp38/bin/python3 /tmp/python3; \ + /tmp/python3 -m pip install -r /tmp/requirements.txt; \ + /tmp/python3 -m pip install /build/Release/dist/*.whl; \ + cd /build/Release && xargs -a /build/Release/perms.txt chmod a+x; \ + cd /onnxruntime_src/java && /onnxruntime_src/java/gradlew cmakeCheck -DcmakeBuildDir=/build/Release -DUSE_CUDA=1; \ + cd /tmp; \ + /tmp/python3 /onnxruntime_src/tools/ci_build/build.py \ + --build_dir /build --config Release --test --skip_submodule_sync 
--build_shared_lib --parallel --use_binskim_compliant_compile_flags --build_wheel --enable_onnx_tests \ + --use_cuda --cuda_version=${{parameters.CudaVersion}} --cuda_home=/usr/local/cuda --cudnn_home=/usr/local/cuda \ + --enable_pybind --build_java --ctest_path "" ; \ + ' + displayName: 'Run Tests' + + - template: templates/clean-agent-build-directory-step.yml From 0fcc6fb7601893bd1e2b53baea4436a7a51b7f8d Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Sun, 25 Feb 2024 14:04:22 +0800 Subject: [PATCH 059/279] Add Whisper model in CI (#19604) ### Description Add Whisper Conversion and E2E into Big Models pipeline ### Motivation and Context --------- Co-authored-by: Your Name Co-authored-by: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> --- .../tools/transformers/benchmark_helper.py | 4 +- .../transformers/models/whisper/benchmark.py | 3 +- .../models/whisper/requirements.txt | 5 +- .../models/whisper/test/1272-141231-0002.mp3 | Bin 0 -> 92124 bytes .../whisper/test/whisper_ort_output.txt | 1 + .../azure-pipelines/bigmodels-ci-pipeline.yml | 101 +++++++++++++++++- .../docker/Dockerfile.package_ubuntu_2004_gpu | 9 +- 7 files changed, 115 insertions(+), 8 deletions(-) create mode 100644 onnxruntime/python/tools/transformers/models/whisper/test/1272-141231-0002.mp3 create mode 100644 onnxruntime/python/tools/transformers/models/whisper/test/whisper_ort_output.txt diff --git a/onnxruntime/python/tools/transformers/benchmark_helper.py b/onnxruntime/python/tools/transformers/benchmark_helper.py index c7d93470a729e..c9c815f01e053 100644 --- a/onnxruntime/python/tools/transformers/benchmark_helper.py +++ b/onnxruntime/python/tools/transformers/benchmark_helper.py @@ -589,7 +589,7 @@ def measure_memory(is_gpu, func, monitor_type="cuda", start_memory=None): if max_usage is None: return None - print(f"GPU memory usage: before={memory_before_test} peak={max_usage}") + logger.info(f"GPU memory usage: before={memory_before_test} peak={max_usage}") if len(memory_before_test) >= 1 and len(max_usage) >= 1 and len(memory_before_test) == len(max_usage): # When there are multiple GPUs, we will check the one with maximum usage. 
max_used = 0 @@ -620,7 +620,7 @@ def measure_memory(is_gpu, func, monitor_type="cuda", start_memory=None): monitor.keep_measuring = False max_usage = mem_thread.result() - print(f"CPU memory usage: before={memory_before_test:.1f} MB, peak={max_usage:.1f} MB") + logger.info(f"CPU memory usage: before={memory_before_test:.1f} MB, peak={max_usage:.1f} MB") return max_usage - memory_before_test diff --git a/onnxruntime/python/tools/transformers/models/whisper/benchmark.py b/onnxruntime/python/tools/transformers/models/whisper/benchmark.py index e57385aa6db8f..11e596cadc2cb 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/benchmark.py +++ b/onnxruntime/python/tools/transformers/models/whisper/benchmark.py @@ -410,7 +410,8 @@ def handle_output(output): actual_output = handle_output(ort_outputs[0][0]) logger.info(f"Generated token length: {len(actual_output)} tokens") transcription = args.processor.batch_decode(ort_outputs[0], skip_special_tokens=True)[0] - logger.info(f"Transcription: {transcription}") + # print to stdout as the output for comparison + print(f"{transcription}") measure_fn(args, generate_fn, ort_inputs) diff --git a/onnxruntime/python/tools/transformers/models/whisper/requirements.txt b/onnxruntime/python/tools/transformers/models/whisper/requirements.txt index c307a3665f8a0..956922dc83d51 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/requirements.txt +++ b/onnxruntime/python/tools/transformers/models/whisper/requirements.txt @@ -8,4 +8,7 @@ librosa optimum onnxruntime-extensions>=0.9.0 protobuf==3.20.2 -numpy==1.23.3 \ No newline at end of file +numpy==1.23.3 +onnx>=1.15.0 +psutil +py3nvml diff --git a/onnxruntime/python/tools/transformers/models/whisper/test/1272-141231-0002.mp3 b/onnxruntime/python/tools/transformers/models/whisper/test/1272-141231-0002.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..6d220f5ede6a7c54893b1dda32b7876c31059fcf GIT binary patch literal 92124 zcmce-^;?wh^FF-0G%QH7^wJGWr%E?ScP!lqh@{}sEse{9G)Q*{NQbm^igdTUKndZI z_xJex3(x+tdF(ycTr=mK*UWi8UPku={_kPr#qnOa)e*gHD8x_fxN@%0Z34hf5hj*b72oR)#g z&MPebTvl1r(A3)A(cL%jZFpjGdTw#~$J*xh{?Y0Af7ds+_YeQBr=>2Vp&-c14}+m= z{hujh*vx223;^KzlO2C2YX<-Ci~paWAD{d~AdTs4@$bUGGEhrU%fcZx#YZn(Qjbp# z+bpz%_%8`9uqmFMXfc%TZQ{JN&JHT@?Rj>YE%-S9$np5dYx(#WhuLU$!KHdZL^`in zyZ7C+3^Y_PWm1iq^;?G6W-ByH+mMe;DMxXFqe@^?hqK5s!Nv4mr2g7ZxES5E0 z?NEjUr%XYGKhD))YNq4gr?urAXd82^MyNG4|}JMwWF8?Ce8$IF;>*A9Wv zBO*fem^5K%ByHKu9aDCr$v!t;j=LgvtUE{iet)?e_E=bS2%`Oe0prw0UGy!!{WV;B zyj1RArUxzCTd2f-)hDcZ8DI>^qE~qN`n8ljx37i7Y4b_mEw@6<+fN)MIlGA#cW%h< zZJI;zj_I8E<@(=FPnRDf{{AfYcHfykYFaS)YAdf)toqyCTm`*Wncy$PW&Klu_Ju6S z9HQ+wp*+YhCcy;_9bD3%R)mr<>fgPJNi88u0cSx};5So^i^B{gPxfrE9NC*#_$OT% zXE+f>NnwZW4#l?K)rTf7cYX5(W&ZbEqo@P2=X1CW6}4GTZQ9*`A22K5f59jEMlV#G zcvg2Vr5!R-GT-F(_WUUZ**_LR*A+WH$fP27Ic1t<2!`*Zz|myZ2o9R0)=7QI^U(!T_}CV5q4btAjC zuP@0_U1rYNF5i09lC3{NF6c9AdXX=vK3~y-TAs?+XdwI2IeFZDaFMExP7}Mo;f^1( zzs$o+>x^xj*%E3|4osEbvdpiKNk}s~bax8QkKS8MCi@b|sZ`m>=H?wl<4}FU>$W3S z&@h46;EjJKCrI_m9@D`K97Ts?S=l`PdrjF^U+6jJZfqtCx+7DgnhM%Mv0>tiDx#i( zqiPwW+qv`RtyhwA11;unkN&!n+x^ax2$fL#6j~f6m!RZen4Lu`f3; zZl{&#eIwQM`>ws_efC?PK=&H|Y9szYXqP~>mGdgml`zk1_qfbAJa>^^GZ7AEcTGZi z7FpU_E4Rnf?Sq*71lR8?ncYEO-cX%Q&IIe}wST%nIX2t3U_P`XVzG~^nqRg29QyLX zPq0(FE!oE?rSz@Ql6rl3kTky@)-5}?L{T(h5v?HWsQAZi_beq5e$kHwI^I1^wp@;S zMbwNTar)7$8d6U-AG2?6T3c9u+xZ#Cp(4xuIgTOAqg97w@|6;$M4ECQL*j4&>3><# zK8JN}J~6KQmu1~5@jSC5tSnd(SFR#DzEMzxKS+yOwp^l{l5oTNW-X zS+!uOnZFe`QV|%=*owMdTs+N#*<$>dP*{K=6>uLJ8daXWW_0Q*{_lRUJtu?M=4Sp~)O-w=`*#AW=f^I>F;DS)#-UqoZ4scgs;#wUTf$?K 
znk*T=Oxip+2OS`d89L)w&g=U1U=ubtuUex4>Z9vXvXNy%X) zd-ytmqbr3!NN%iMglZ8bV%6tS zGbW}|y`amq<3F%Y67g=D*8VW~b@%U8_sdG&U8XVW352GP#mrz>6-=5O(_Ro?(!#_r`V}aogb{R zs^+i0jJD3Yo*drjRl0E(k5m2UzZpp;k3mAnAxhjuvAo()9F@a}!ziuFPN>P&)WTKX zhV}vVjn;O=(-W*R+*o@U*_g8&RZ|xYZj0jgy|)Cv?fYZL^z7~2XwKZ2O6@7#+%wX~ zJ-k_d)Wxg9Twz$pveM^Hzu<+wTpCN!PIq0@aPfA+M{;@9I4?EkWua*L4bs8E@Y;$c zoUu6WXS%ZiCwEMEPEWY8z;icm|KR`8>17nIP+<87?~1D{UF!fv+qcNnv%uejX4tW^94k-Il{ zX!tF5UTw~{+7euj?UR=A{2%UpRg3AX?{Rb9Rbdbz{WuPG{rV--x%OsjWzg&;eR{UM z_XstPjo~HvONp+X~O=C&qi+;yhsO#>KYtR4|wEx>UA3@1;%qwl1+)ox`x8)&RJPs ztu6V{AO#AgiS&JgnD5P78KEqiRQ-o18OQ37-7&U3U(uYH#1^@Nr^++}~t zs%IC8wYD^sGs>Co=T@&r?=t9z!}IQJOPMJ3%)%Y(1a>tv#TZz6eQ291pKRXdKs{Jk z*bx5RzA|jPFKD+qcdJM#sxZ;+4_XLH%qWkHZ`7@k4Q=07=oi-fQ72WM0{d@QpVJNW zyZm@YTWUaeMcp92&U>yOpI<5Q6J}q$@|}xM@T&de)E~%;vWfzW6Uu@eg_#_E-b^~H z=r{{WV=a-m3>EW)6t-?@5`AYH9S!+>xN8Wg&iJq^qPMKWU5bT9rxiuHIrfSgFT?CA zJX+mW>$h-cf1Uit58HSD&*nF$*1-W2)xwf+75m=~_DFNem`T9?kJmkp99 zUH)v`MRqV`zg_nvGcgXjA4&lE2go9auP~j^!&j&|M?p zIklXgM^Biib(MG%F%TVr8{!!PM`IeNcG1BMrjod|9|S?^;H&^=(spB;R(G^qtN7A&-}({ikok$#pDI z=Kv@Wfenxk(U3F-L<5%F4bTCkmEE)G{*<{HLFoUHPr7@_@XbtrOeQxf@%Vg*jWDnc z=KisB#cU5;XKbIG?Iy$s=?1{TAOHeD0E|$@2c)UOZ~^&nYyzMMA(_td#1Dhyzk^k; z3KKTCW)8@*q87vJ+zZ-8h=CqZ5XCx_5RBISjs|@L@Bml-_g63w771Ds1q>Tt1CZvV z!$<|ylEHkjXYjkl_*DrTFre!Eg1BgajcxvRtVX z*3a6}3=C^P9!$rH_~~D@}_TH+%CwK0UiJGsjkom?N!>Q=&}0 zz)o|bdk1PY%Y;bU@pE{5d~~LY6eA81(-GHB9$EYEeg3D=jOj=ie_WZ`+)gPfLNH^8 z*qPZOmf%F55M?qq;$9{N&@^Zo$*y_B5kfBXl&7t1Ma1>Nz4AXgeYyDAIhE{xC?2O& zX~XLY^}1t8Fk^}2dZUnz?_fsh=0*B3GOr!TQf;SPLI;$hS%UON$B?K-@E=- zmcm>O&M+y`fVp+YXk025T}U>tuPu^T#^*&aXN-inh{kw7NT>IFF_dN;Qy0B*9QpvWax<7Ac@uJIWF-6Kcd8 z683ocrT-*oj690|(>b=X1yVDdy_60@4-5&b1w%MP8NvWSBt~d!Xdd08%U&h7*L(aR zQy&|j%b>MJCA()#(QrMCT|l=U0CNU_Acs8z^3 ztpFzL-XH2OR<{k0A2&m&1ET)xu4Eh2K!<#<{SZ%q^;7K?-qNxUFL0ukNW0lI{0km>!j41;PO z2LD039H$dIx7ohWC9J&d=tR1~Yt(07=W_jqy?NWsz6T)p4i~Hj1IvW`?}SxKc*C}a zj}{y*KMS51uE%^py`HHasn0Kby=z)h*}R}Rlhno2VD>nX^Y_+kMeon+@Xxfb_H{o| zpD*=a^!o(2v5||@o~@-k*sW!(b+qh$T>Tph zvX7i1hI$w?2PNi@30M?NV1ieiD7T2g121}%n(FgspT(^DryN}8^Cq5n9D@UD&jo9n zG&rz$u05gBp-^NtI+#=WfVGv{mt{c}d!D`Qo&0*YmfrHgf1;-W;w9RJDM@{}boMeE z$@%m4hWS-*Sl-(T#2!89K#Z5mw{_~})fs6W7Kd&F?cnF~5<2aaId=aE?4~CT3hvm= zWg3lU?xUQm=XIWOMkzrC1)A`!%j;A*E6eyp062Xg{|1NdG}iS*C9)4S5QUKQ8r4~> zB+AzEv)%T&w~{wHJp`e++T-x?6A~E*iL;ibwIvl;n;iNQ3Hyr@As?P{@Ptze)N9eJ zdNp0(R1pPd;Y<*x3LBGB5!O&}_=d&o4@Q$={ z?=8>tj@p~-lV9~NG@8jWf9awEP3HStZoR+r2l))}=JYny*lM_(?65V9)PLfsHjwh0 zpU*0`)OdKzkSN~QsAm~(WUnX11OWkZAy6YMBmj;B!3&Xupd&~iI5I$_3`>7DM(rgb zA^@(8@PXrmxWoTvjbHMLLHm1Xp3L$ zilo`o6ZT{He`LamT(^o1xwYiBN&iia6ubozpjZ`hP#PzM30ZnpO^^Z{^Q&a~76`m&(c#gyVO23w58P zLm@C_BJZ&vylxpIGf@~RZ`W70&3Vw~XQsG9{C}{3(82s=;=?MuGe?f=A zxXFmZOYne^Km;DPH2|BGP@RAa!4feMOF51v0j|SS=heo~RgY&Tjjt9S+lbKcB?LQ@fLU$g zGeGelDaVhb$GO}G5n;A^sX=`ybt&?SocOW*p|Bxa11j);7)T9Z8#6>U#6Jwlin2zF zK}b`eqvJ>uA!&dNk$-YC9nfz=k^n@2VT3d>5-Wrw&O;k=7NPA(J=7Bc_YYmyDkXJ0 z7IPB#&n}M`g4Q%JMXrLGbX;x2HAI33XaI%ivdW5hEbvDQC(2V87c9X%2w{tg0yKQE z$}av{M%UsBVqj0VJU`rs2ms*43&hL3aRX=7g1@4QI^Dm-xS~h`*{4hVg=> zJ$Jl0lGREsE!Gc&STcyOhZa$MQvVu+f?9fyO$#}%DO#~XS~Gkhl2D* zhVshrh%G#YgL`#6T8xddd^dmH`Uk+dm2~J6?mKq8-=w_K5p*goJYxv4OC>!Bu$+H! 
z58G1-6J$|_0s6}H$`A+?N!cA&J;a@QgKR+~<* zm)&{AL7^k@FImCj*fRN^p&R@O1s9=t{$Y9f5d|9|vK*lRnXsPBuz`&@#*MgP$^<9R zsPWcL{F`({W>G}G@dBEbHLGz~+qF4R_#9avCc_Mha~`%c98@&au5eqGG2n_Vw?|^a zDhwd#*!ag3?3o~Dyu2MPh7bpw&AKl~V_DO zRfTO5UdkNak3+y0%IVsfl4VPn#>FgE7}Qh^W$4DHjFzw*2L&f;2yaRLixg~uRiOhx z)?L=m$uDi)Cierf_-xX9l5yhB~6Bg$#n*0xEQ zjOA@naY0dvfU+}Ofx!2DWtnwM31+RE+c2;Ppl*7oye{r~47ewMsuxFst^{RWOVL0b zuVZz+FCJObyIK$lQ+F$`0gs*}IdOMG_QEoZ3PJwanLnTkT$85Wcw#WN<8QI)hQ8%3 z`OtCN7>xdc-u`IlkJ55Vkt)YwQn}i*&l`7E45N~%{z^h_M<22TXZ)TK3YGK3)yD=qMER1LdJwA>Z|aOfe=$pHo^V2g2rqF9p?0=%>gA z*aqILZx8jzJ$b`7+fp{WUvEB%b7eoW>~^`N z!6yzfK1{I|DD&TNX0@(${o~TqfhTUorBusBB;#^+uSebqau^wAYwFK*91@NI9uT8c zBnY^N|G2dDMj*G6Ajh-5f1J&+f0bWwKIUNtUxTmc1zehvke2|_!%U8gdAD_L+w zfs|%_FO{(^s#Eb$#361V_k$@>$YOQhoybI|JQ4#?KgYU6)sTo)`+iA^71&>^Hf8+VKSqgwm|GN@|hp zHKkghr`SuJp1cZWffRHbJXPBx&y(^$$~ND9;|XBAzmkqO=-xW`jl8dzXC%LxZer}u z*RGd-xwNwqxR>?*_Hm(x@|c6B-i~t3?q5&XuYc=>Hz@k&){Osumn*{MUg(S=5G?ru zTQ@d&FvrVT!VhlqF1ptQdS`XGg3)cStolGD!CY+l;p{?m+r-N1{!FBAO>xm2?9a6Z zmrvb)z4Y}M|G1|&sn=GwUT&$w1ztw98knjI86=-gjaI2%JmlG1ZY@8;tSygT`Xk1# ziglNFC+|Ogq%J;-o|`Z8Tlr~QlK(AvTlT>WM=s|eIkR?t6!FiV*1jmdb(IdkPxX>W?JyHu;XgiuD|>y)*GdBtwCxJ4^cG^AD7kmW9E z+3Kkf$|$f&s>B7gV7@jiWy4v1KKqtUm5VT|q&8Vo^--tA8U=Cj60MV7xF3q{D%NHDx0*+U2_}7FRsG?ZHJ^eRZG_P zduel}NG2;iKAo4>lx)eHe)f9&;NlG%e$EjC8@3f`hdQJqzok$Q4G~33rZ`L3yOg#^C%P(`co0n9@Kq7Z=P&-L ze0MVqrplwr%`i2c_upbGsr$MqpZLYYpbZ^E7=bUViSj{-jjMkCG`|3 z3kSbOQ(YUrS7v_>Hp|ZXN*)+ev{vUU+j7_=?j+RT)|f(CR;bf>ZOq!%sPg=6siD+L zd627t`tQa|ei0jjOS4v;`TAY9I$9->pN=8euq+B)3qiaT2z64du|Esx<{j&mJi7OyhnObrt z`l=GpVlw8N2zZHpPgvv*F;%g7=|Vse`Z;?124RM`Z7~vL#I7bx9!ET>g{!YbtU8oj zb(W?KX4SMT6GLxkwhttpzZ8_()O~!co8ea|d||;^?(qAnI9bu6%h`*4hUGVVy*STi z%j4ri#bfTH{^PHe!>$L1uQ$0rgB;ETR)inkr~ma&)2vn;wARflEt{NXH{ij?!LY7_ zD?^C^)(FTxm>utxOd$>m>O>IP3k4S?88}*RnOnN&~#BK+N87%@-|Sc`@Y>;oY1833rWyI*&GwSuz~Kg!f{ zF?EL2f!$Jy=!i*9m@dX8I*f-5EfwH_aZ15E>qyXWr@rz1%Sn&7N~S!c7~CDN1b{%= z;NoD4$zxl&g8CVr_|K|;{J&mY_Bpm0H#;j3ZUSfK;U&e#P}ugawK*L)=)C&Dw!CHL zXZNh{eMRENhv-oxG9)Jck2z8@EIFKz1387AAL@Y<28O2x*A;hcCW=Gw;uDR;J5UqS{bOlyxdR3EkwEi3%2h|6S@P zSMKvHC)_u4h+I;c@dJxI{*WPM62Igl@5nqWQL!k)$cub!H#%xgh6b)^)axYit)^ib zLa=sCjAWKZl}dg4H&o?E%I>B|dhb^~rkwhfnW&HH;(8e>4fOy(m{p#MXV~dS&AkI8w^9hNie~r?428!{ZWgmQL|uXKV|m7i7mb+mfk-iKK5mF z4nT>X?j`1#+H#!kZ?u?-pUTm<70ipq-fRykN`TwkE97fODOC+yjNiFAut~NvXB!>? 
zq_q1X_>2h?V(R0o=Ci*tgMQeZ9GU3@0NYM{8(EmxP@7Ol{;S@gjInPKROk(TQACuw z!n#HVWBu9GdX077`|9$S3L_AW6 zZyDwmFU`lmzXS_L({q^wrN`y6YZiC%b1SzYzs@51H2eOLGxS7Q>-t({ixXod>_sw> zyU8VU|bi7~84PDo7-& z(^`L1E{#y1JcR9=J45oVZf|SG#s$4N)g+=AOQx?&?k(!P>RgvobsAL~-j7UIjdQ1( zH2v3~^^kgFNY}XhP55OWU-^8~?8~*5fj&H1jdSkZ8{HlS{f^crtgGbI!<{f)lm;?= zxbazDV~x^>8)^+Rak}D@4ZL3Yk0p{CEkw{R`G6msgr+0aXCa0spNjcR^*8Wq7+6&p z7rW-;$NC+wL6&WY59SR{RX6^<0Hnu;w-=GD-ByZA0cw70V@D{tSu23I=Z~0v#}6L) z1GAtX^9#aKzrw;#1>DTjb#`^;l256e2$by#hBBp(t*gUaZQ5tKg7o_iR$KNr4&$Gt z>G>V1#-daX>^@|Dl%aTtkd&sh1Lh0s6%&Is#Q69z^JI%v_}2rWb98x zGTO?e(a3hOLY>OFvO8Ai7^td>H5|&RhI%PS(81W!O5b4!j#QpHg>m$pv(xq+ch82nchjF9lP;xAoiNOT(Pbp9 zEI+LX>E)_F0B|Ms9V-dISyz6o`&AwKRk!8JB%=7&2o>|^UqY#Zwfrl@t=$>I48{~Q zu}>Jj$XSW`<)K&Wf?7%{9H|^!IFF$hDwAxK7m3Tcaye_aXR!N=Doj^ZwQ5==GI_g@ zd+nQc9#ahHSdbFRg-1A<85u3Jxt--;%1kE6oceCIDsElNgp$s#)UK499iBhiGuKM= z_pxj5SvU>L0sW)UeKc))ldg$kdAam(Uw=tOl7iU_XFd@^dD}zQ!@GcyAIn&?Txixa zQ-8K&G4?)+%oQeY^~0yh%Bp}udr8aQ|EoJ(S+}h@0ZO#ZO^hoCGc~#9myOI#nSLX` zTa8`N(CE#)>l<%g3p8l{T+1G`H0sz9%s8;ewLEEP%DUnbsiZeuE0Nz?R8Q7F!tJ}i z6yVLRXK-TW*`!1NgR{8{=BM_#7AAWZXT#RXVKf)+Mch|qksHK2vT6Kxo~^D&u;TV+ z<)SjHv2y*wdZY&4`+M$DP*A8?!0S`h{n<=)-FKs1ZdN+QWUH_j9gN|dCY54O{BTTo zjCjPtW@%eY%&WiY0)9@)!ZsHh>k#St-q;<%jUHvi-mLOmoL_?F#Z>xyHnQVDO~5Nq z=|f`G8&Vea><;dnsms}^-oqvSLhb1agJMDr^~CDorAE~3|Y?l1YB>&U!>v+My;CW|c z{UM3Vw*@x|>lc*?eDs>vBm$W}2?k55rCyq+XZEW>Zf(8krb^?lg${~MuDlC94F+{` zv6)5WRi=A)$d#4?j}}{O|FfqozcD5AvOVH6YJ{*x8u$eU*{In+<<6Ff6nPhoHGTZn z68j6Iqz(Uffw-G)cSzbA|Ke|P*_@nn^FixhU}All55LJ-?EHrAEIMi5CBioqn7Qk^ z-p9#(7<_nPH!dp7g#?R6L}_G>rQLm;?^-zRlKlKgJ;4as+fX3bRdG4dq;dS>&trOE{Z==kI0gL=RiQq!y;5-9gK%ab$_a-TgJBJAL&uO0JBhI zJ8pNyKQdZw*9nxb^+Y;K-knzrc_pWg=P;$ioe;39fPo!|KNYxVzrRm`C^ zQOsfbI9tp&Th$OCkzUN9COV>%RxG%~p{n~5Iml}R8RS=`B}gmgP`^zp<{B}0+!f7_ zw^9H1`Qsx}ijE)ygkz%+{r>UJ3A?w+c1lZjHlhFP{L37U6BRQ|*QEkG3TNg`t@F}^ z`adEp(Mbno8up&xe+KM-GJ*w(*$c-Aa)ymmhyD+rncv*j(2ubL>X~^4>`{v1Z{LR4 zL`IUde@S<4CJZgy6Ih8*cCoitJc|uFx()C!&yjL%p{*$I(M(!kpA+j( zu|5s!c&EiwglZ~fgO^5h4~Ns<9gv{ia! zO5LEVmtBwx5G&HNgN{uq5T|k=8F}jjp#ps&R47J*+odBe^}N{Von>rh{ykGc3o(T! 
z&C&_`C6xH~)6zW{Dt z9<<;R8NR?Co*^#P8S*x)Cy>il0iP2!!V7ha;UQv99C-ae5fP%|Q`b;c`z9y@{EyCi(g;j@ieW~n@ zHGxP3p7M`q^_c%xONEK2&ie0YvAW|{xR76*~#IYu*!px!JSCdd}RRH z9%lQRR4Kc2CE$TTf2K?WuZvV4K4CQ*S_rdNbbYQ?7D0|aPvGudpPI^G> z-tq8Y76YPnpXP1bGecf@G{H3;)3MJCNLM|>qwrD8PkvtDDEMXZPph`%obmp3vj<}H zpQh)fX|UqsbX-+XCQF*r$!fw7rRLwg`VUg`Bj4{w@7*gVLf|&1r!59F*JrikM|XaE z7454&lDR>Hn7&BU6~4-!VhuGWqvB{4f&%s~>aVGFRn^wIG8Xy#KU-W1wfv--UlOkT zrL-lz?E{q#iMy_NTb}GsikO&Q;W6ss zwOZ_B$xx!6{wS@#wGzs=O&VMT=9xtHOA9 zQM&BLwO02_!k`5-2U%GqlbJh^(C>8rI@VT(_)0-BWYi|B92W2_R*uZh=QIr;H^=$= zHqOdrbLZbU3;cju~u|+nb|$pwnF$Lp#^pLZda3a;d|M z%r6P1I;J$It_kXtJ~4~a@D&)m#2*jqx-M^a4kRJvThyz#W+9E}gT*YpGxWsXV;N^f zWwED;otb5fx@v3AwlfAXenl{GEjHqyBZ>bAU-R}d;6s}>2Y9oj2#D+{A>ar^xH+$` ze%qjVWaJzqt)o?8RUyh=J-MwoMO?6)6P_e=?u%LNU@J`sq(F2Dh0w**!1hCc&lU7> zNd)AuAu!oV3^V~5G?ev^uJAA%Ds_Oq4+Ng?9}C3f#xA{qabPD^<$vx|ck;|whiie1 zdA(Cv(WT)@Hg&42=32GF=-`MF{<=vnc3qv5;g2StLJ*tM@GlIj<^@pp^LsIZ*lLMP z-r~h!`T?QJv%+nDkcd4Ix36V@QfN`gOe%+4Y~re59eX5%`K6np^*P$Z@k_N&?yXGS zMw{^hL%4v@l{**gjXeAq1mdYzF#3*}+}%HO9e{uX*s)15@IX-XY*!Vb&JaK$8iouM z0M(lOC%pNB9)Sx;9mL{Okp@zYN;5-Xi>?@$0(n|<>zg6n(FD1;*+@|N5TsKM@OoXE zc0HpN*Jxr47Y_>)48w#?Mza$N^0W8bPx2oFh++F4-O_}F3@>9qjA!hOreDW8NFMf7VhBPzg+2F z(9*sTF%1PT6<(F+hIm-yAOMP{TEfT#01ZNUsfOO!XeXIjYl*{Zv$aUW`yl8#dD8N* zQuzU!2KvbE0cmLXw|q1_tT7>60cWXk#;syr0zq@Z+xvdd7IS)N5fd)?n2;6!B=>~u z3^)NSL81}s=vZE!m3zeIg2#i+C7&eki8ef+VrK0+`Wo8#uACxxW^9~mwN|`AG5P-A z2u0iv#)H}vqL_3XE7qH@(O-!8YQY!g^4i$$p*gfT@Uf|@>MgQ0?Cbr4QIPXv!h+6= zg-46dMQg7xvR3mKmhRv4iqz9Y>@GY7lj?s}f2ax!C{R{Q(4ZC&Q%qKHT1*`;*O!ga zSo>-c_f>s**?Xwf`^3&dMV&g_nIJIJQ;#)ZUu zTaH?0i@@Sx7qnko?inN68WJ_cU)vqRy)NYW=he!PntnEp~O}j`S-8oX*%Rt(% zf{jo_zhc$u*uWJ~=E!E3fv39@>stapCRIoLd{iVP&xqFg8;Qdg>Y&#_iissnlK8A3 z^(V?XS#q`m!HeD}AL;unqfYo!BS>(%dvZ)zr?R@x+Np(m*|KbYlKyuG73FPR8%HN2 z4jDyndX@ZW?X;b-T=Icg>(X$mILYs=poE#zuYctio^bVt!7O#jPu8TG$Rv zA9AWUJGH(l?8x9BRyzO>&|)2m0|yue_>Q|?`mW!RAGtpi6PJ*#x8aT)Rg5gW9<^uY zE}#j@QQ~QnzoClmA!mNCWMSWF5Ipy9go!dHEo1M69ls>#Gs|v~fyV1t`K0vohj;u= zonG_&F~n+dQQd4&!{U1w6!L4#lyEk?%vMR*iT&47&te|!D! 
z`l$3GAs@b*O`Vk6@)!(Uu|(7hr>LZkZ+QVnGb7&C^+4fYy}Cc+wnoRDulvjoKDDPD zXnUo4lEy8(-71*UXLyTU??zTABqwe!=5_VL{`%ww<-X!ed!KV_67c1!y7@8RuR~~resU|*Q~az5GZR<;U4jF=2D5>$B-KfwR9O7$ zDMR>psBoD`WO^7ByMF**96B^L9Lq8s3omd|dFg4*vMm~_m5o*8&!Cm8Ux+lYc+L}T zZHD#jLabwSZiD&2a|YXrp!c)DNl>Gj*v75(H0(v~zJF2vXr(|3W21hG3q-$4AdYUc zKu!A)wtJAl_aM>grBrG0=G}au$PVLe=X0+%F6T|FFBzFJ(zgpW|2pV=8&$3Pznr-% zjD|<7ZCs4g$W@V|cOXEt>2XFothfV2X%r7rmN*845j?GHiudlY{9%Rgq-3Fb*D9XM z1EbpBg`3yS%G`pvU?q@mlEbY1q8+cG2rcc zk-GXvlvkrx2>Ir82GBcKZvRO>^Al=wkS-_UyGv@Re@9dcd4$%a4bDH#|v2 zS>3?dK|SY@+(FFTUsV(9_Mc}C3zd56S1dd}ltq3ZcMY4;AWzdbIrU9^j16Y3*4b)r z{al$w5N?FZAq&oFjf|sEGnQ4QH%yFB`iXUi`+N}4>LqO&V;?AQr2OIs>bX=Kg?*G{qj_dv}uZ7zjH4n z>;H3t4r}7j6e0U*)6#&#RyptDS$`RVky*f${Wg8zr2OFCt);&`+_BR2QhGwL^2I;} z)ns-Pra)^9>QZq~xa;8!>hbHY5H{)euTr|AxAV^hyMW)RI$B~aZ-1h$Ixen0ovvK} ziNMd8v>3;|iLA1|3H$A6P~F7#ctm}>T9v@UWP# zC;$L$tb0*Km{C)dI6{s#@p3EM`YFZ^g3RiliOx~^7ahM4)RoFvJdL(w#(KU67!rE*7n)V8sa_;_=riPM8d zzB@yAfTgb86KkVVj9}n1utEP`m4+HG%({-SR*w<<1_%Jj`e~u{t^=&EOb#VYxPQ>FjMgh20GW0h*Pf5c=IG6ZoD3NeNl zM)ZWRGeJav#$nP5pW9te6?3DWIPG*WH+g|&&N<;Gzdb=0KzH6=$0#gqA)Y!!zS|Lh z3_GNXE1nqO*sX^dPa$p8t<0m2-@{hhiuOp|D{wMM4{-~cE$A}osJ0=FXsI3!V>d@a zLgD8c5PYDYn6x?w@iiJB*ljIbUK(Lz^!$@+{a;#w=T`8sj{UN_l7ED&XRB%UNB!+)m4G?2SbzTy`GH8W z430!&vdmwZ`PGyl$~C&OaR2d4K*B|`lTD^QeN5bB2jrU3xGJO*DT+o^`sYU`IE1Ji za-8Xj(5_PbOjaa|CeAuVA~`=i{KqMuivPr$#+#6o0m{jETHhE2MKrzOoWKwO>CBeG z8>f}=E$t9g*jryz`YT=KyFExVnIemYDIw1UWjA{Etu2{#lxGj#{kRDU>D3S-EN>wG zrNa2Z)}kst!!`)*fV=yv+gnPy$GCM^UX|@TP`~S;gjoF9@LAl$OzR2bv?Y;T4fDN=z>goGxHgkLZ{3d)b7N>zsKIZ z9H$LGwK!s-elF*ERV-ht5vGd(0|!egsHv&xQj||c+AVo;_I3T2z!4f|f@iv7~5L?>X`J#G8M*nSZtQWvZ^Jo5uUc z$8V44$v>_CG7VExlcWj@3%f-jE0Z|^nAd^tn_Tuf*PN@|(2D5A;cteF1tv*e1Ld}` zb%7WV5@n93b$#b7oJf0VYy|KaNBhS}m+`)-#G567nXLgK2tdGa4l&uN7a9(%fZPmS zu6k+#l_jU`tX;6lk`%$R8B~5=ig7Jo5vAup9yK=$7$ED*sI;%e_)SiBlVx^%qsPco z*!iQ6B(d@8XWRT&FF6Yqwsh@&_R6mK9j??1F#x5o~y69CHy^5wsmy&?MBp3s{?Qu3B1o z{5NS*4pa%BQw>m?J9s*SoV^kvHnGea0_=Vcbox`=Cf9TC63_5=4msCe2IaJa!^3gs+|l);58sN@rzi#ie_Ndt zTQM4PeA&=MDOYA-Bq$GHDLWnoU}zzBrCdx|#0E@vdjs))Ib)AvYC5#_CN)BViU{cu zS5kG$h-6NBv#h;&IxOach(F#)4`9V_^**2={@sv|$YtOn4sQj07|2gu{3MV#lHtkv9}ZFd#xY1>vpcmA3t@zXnWdpfz=m z7~>z@?`b3lEB5Zh)M6#{B&bxhzV652aJg`B>?LK0N0({rSqJ}5$M`;t{Da{?9P0sO z!`X3J(ek9=q}X?#^FJ7ST9!;naIm%lIx~JNiLhg`qUXVJ&;)@nTw;s{z&axla0vl@ zjuidZ>2Sgy{?a$`pCT42!$IgodDpsT-1NWppoUDCAdW5&o()sP(#1$(gT29E0%*Cx z^nmN@C9vmc9x!qwI>-qTO{z@)n-@C2G}?dNRx8N8Wf=(_*5DH?6V~9bEFa0oENP!rJiW0 zJbOwRs(ME~%}|%?T>59&-z6 za7KEZczi)~1AnU8iZ&?rs5D>HA9&p`Zr-(L+2nH?wBNfjkahjiJPz{cM(G!(MiZ_-o(#_m*+I?m>4cQX(Jn|#=$E?*q?!zcFv+oDW5*s8{3N4h-iRsO#M2O0%1+R&ZFq+d0w$J zR2Zc1%paUJGjhLsJVFt2wOp0`M6gO?M22&TSx1FYg+00uYa+>M%vjaMLN61)ir4&- z)3>$6JGj)k(?vq`_LLglcmFex@VM#BewMd~X83!!r%RxrkI~H4`I*%8cKIh-jp`@<;p4hz=$3Cs)H?bb2(fwN8;7oHK5St>fgNZ zjiY70Jcw$h;#*(m-;9Ovn~zCMV@U?SeKbG0QEgecnf2PwOD7%ya9*8h*8#WoV+l-;an(?CLRFL0Vu#zZ0d%K z+%uYWoZ$A@A7A+}W!9BU-&%&F05+A7K@zz@G%gvUhPEKPOL%i!5FN({SI26^N`47% z#P?c%2B!xs5^%Og+6PrctZ>CB@X7SpVUJORT6?2u;zh&cc;o=ja-?q7s$RUx?eaEq zBrNq96F~X)u`!R)qJH-TVvSLEcR%Fknb1#EN?mzB!F_@B@rqbvvkj>t&@g8gb%z3# zGi>XWg|lkS?r3sN{BBrJG9(_>V;86tPuHW^Ll1-ypme$ou&dVE^bJDg-sIO`>h)WrG){*|3o;!FnSSf;pNmYE&wn}f&hcSvqpDI!`|cI(v`KN zpK9Ru37GX3H~L&ujtHR_OO^xviPL4mpMcN7lj3kCun&G6H{2H&$;AkJj4c$|O$MvO zO+nQeU>qXUA_7$~rV-!6F*q2@@RPgisBfc>?xYjDFz5aUFII<@4gyOh{l_iG4^mSf zJI)3V3Lp%e)6&o{anO!vZH_r^ZTtTWOF6BVy%WUl?JJSt@I2?3}R z1%k%zq5g1}!9k&|zI*H)0EYn(!UfQE)>QMXk2gT1qY#RsSYYLMT-KD*GyM8AsYAly zSWIwI0u)_&$|f8PCkL=h$vQki3aH`@LV!j{0G`H8HyA*+VIh;i&R$+1ivxsrW5{O~ z3vk1`VdbR5a{9NdeTx!6>xp&>a 
z%L!fiKcUmd!qkJGW{URf7Cy293!dPj{&;HQA>4Xk`1eHGz#dhGjbS?z0h+UemCN?x z6zOP9AVB)26Q(7hgEa;N`1V4WAw+#leWLgf7%HSYnJgl#S%F1x7i!kS5nA*jR+ryM z;bo@b&Wq9F8vmmyloQ~IE2dG0usc!#AcYhFuP2jXgZ7iDhFFLX{at;1v@&wT+bwe3 zj=s;MUp$?hRVF?>WGd-+HPtU6JBE+aW=k9dpg52+Kzo1#(LEt{9SaTvh9MweEONjk z6arAZD(v8SWvYm#geVXZo#lEi*zzNF7uW-Y>AglbD%URVi*0prpku(H<_H7~=ML z;`w{nstG6+06?n7?`0@i10mtnCI?BNG_{O{?#_8yDLnt%n^P9hYY;7{1x z@f7enC7?|4wD3)AJ^Iz*5!)bHMsa~R@~ULyk;M7ph$BOD;z6y>;+xVxvx?V`RztO1 zb=7>e2XfTxhDTDr`nVqd`t#?XM{N7;6=c!>sK|JB_2dbO$+v;i3qcI#Pn6NpY`_KC z{j>b=%TC()Aub-&wy_qt!S^lbUKb*;zlxupIKyhm7{J(N*YTs~vY-1NLB|-97Sb;l z%XevvbSP_Zt7FDyMw}wB1QLm*pA)Luh1gV=e?|7O^|KXlcz3#Q?^3{1Dt}|cLAuI| z&O0wuy;WM%yef-OY!C|E!5Gk(c|4|*^sO^*^ke7GZlAti-hP~3JQR2SKxrHd40HdA zT4m3hR1xDG2oR##LEFCPjkh|`orv;DL(jE{q9g0);@_K2nYz;R>JCuO(bRKiWG10q z`{7uOtKzB-ufIA5xV{@~C~lYhXk7?U#SKR!r`@)e_r*~uBFeHSd`3H+)T-g`izy5JcJ6;`x zBi7M!ez<6YWAd6ETBh*WH^lpgqwe6fb3kU6;!^+2t0AsUSoA$tF|dC^s+g)~ZG|!9 zQ;tSrOs{fMZQ1)no?14+N3~36=(AAxrguznppK?HKJAogjUO`y$1}H&Y?rWz zVHgve0V2n4;5x_O@M#^|(p(_G=l9!MJdQMhtqY?onG1tX|2=Flle(3nL-OXke67j-OIsQas1afsWAmC4sBdS7@TM z55~vmvvy?T9pz~<&)0Gp6(J=}G9shrC=*U(EI$r%)p~RkqI2lL{+W{c`J>qvUBCJq zXbV|}FN8ZnZU)_B6>7jz+DO3UX^J~BWoS|YX4t7Eju7U#K+>_f5kt`0r*?`nmu?Z z1uU*>`)BvWL+53`bz|$1unLc-dV7EG^QJ!SBYH2z*$O?RpG{GlhA@3}m3+bX{@A%V zdY!uRV5RcDS$2}bqUFt{H1FyI-5KDdH}t)_g^TVMh&If%EzxK{c(dt)yLpfLCyVCA z`sG;{4Ux*q)Q_5eA=U0T>NPJ#LO#z{u9Yu9rk?u7j}6$~xKw<6y?1x)gyBDV{|kU` zFk^ADVBFtIsf=o7OXFB$4jx~BrZQ&=8G3UU2b;0gE^U5&cAUL(upjf}lcSc7a#7g_Er5U{n`91Dn= z|M{iJ{B-T#Riq-z?~2y@9bUZU-x~P^F1aEAGlB*S2DABD;I zOvVo4GE5V#T7aT)fN){Mha$l+k5E zp1KXs80$tHTZbX^INAP$cmIh5f{n@H2rM|1#J6h1UGI6vlK@)n^Gt`iC0kX;%E6J9 zhS?WM90Bxrdgj%GfUvO^Iz-GX1fucD!eRHT?r9F|X06|G11{bD2b??d&2zrzo%()hDew4<64QN z`$FsRwlm$(sKhyFzU!gx$}-hmRl0yrKK@UOmNAj-tx=3{0Xt(BDE-@6Mb8{<0h3_* z^O(PxuO4Ovjhwz@8KH{^Kw0H;O!`PMspOv{_K9AE!uYHIA~g5{BRMEYu~6=ysQ(~B z#3VTE$vxN#3?M}kpl&4($A`PMTeJ-Q*B%#VjtCv6z~AmAJX9rmvBj7SU4GoTW=*%+KZmz4y-$pZv*g8Wg>KroRHX%B7*&2` z4^T|^uIqe_{Se`x@K~BK0ky=X$WJHnJuM>#KUgsg5s#0Wc?bE_RH)O}oiq6uMuBhs zLHP0wErMaF+2$cMYL&Lf={Ze@%BG8N$F)uHqvHM+y%8v2&{6a+7|9s{w{fI19h z7-B<0eH*85HYtnmDpmXn&2&8@JcXd&mCTwob#eV?*e||oZ6@ml>3#KFl^r*Xv40d? z?#CU!;Vf(K(X^Gg6p7GGiMshCduf}@*FalX;IU}8Ba)e*3Z>9#+9Q=iv$&7u4Wi^Z zR%s^r78iWj-t1P{Mqd04jIC&od(C$~`u)cw9m)CO+E?+x4DINDXPUf1Q@}MRbT8`WOTDwx6+{-S%psaKiSK8d0MRibN zA5n%E>$iP<*_zT<%GH)Kpxe6*mXEc=e)1Mu0W3!@EfEZV9GS=|>WpWMe#+CzjIJ9^ z{`G`XK1#87-2ZW5n;{zpRUT}+w7J5f#rAy*R2MV>^Q?-VV^S8Gd^xIbAa zBnVS*Sz8}+LMyi-l||PC0^o*j_HvgzJ=-1r_oCuuAKqK$MCC|WLAHuP%9Ux6UBUog zZ^}!vD>PhAjfsj+sicvFkt6Nwmr^#_xtGLL1hQ$nK#r5Az}n&Kvf??p)&~VYP0IR+;%`x&3Aw#a-{4q7fvLkse=G(HBdq?I#tAF1Kmcwx3z9?O*l{DIXPV zV~TR`Jdr3f@a1jE6&}m2ngmI)Xf#7zZ5KQ)+MR-NCxE0(r=_uuh_D9Mg|_Lbp--f6 zzt1#mEFc`{ucF0+8_%z=mCKlsf8@y<9nAd72&(7yXtV#T_j#~+pPLQ$K}WH9EO#mV zNYa>ElB%SLz9X|%kxq4RY`jp;?~TN9RPiRCRxtQUiP0Z-=CTg>IA zUud2j&lp6ra6bo&X9tuVRLn>jI6QU2R7kO<6bz~8oQ3yV+vkAsCbJC&vWoKkSGAug|hlF9uIG=N3S^qEnkb)K85 znvE%R&(Xi>!R7;Atjl}-OuS|v&vDSt?5B@ysMrr?jHKzPb|?=Kt3Sl;hfAehi5hON z<9-PLx8$?P#H#sE!?yA7pPl81TlSHl@&k(|s2sSAES}&Zz!w+B?%B)bB`7ewwEp)G zIX7vhtM}QWiSMrldIkB&g+BN9|%Y%CywaQL`j5F3wxm+Rg zy6Ok3@5?0SMz1!dtelLA))QhU#oi#lzJ)N%rHHJKHQS{zn-4Kj(ZampF$4Z#5{Ec7 z6sxg5dQa}C$rG(?j8>5Sd$E|uLBo#DFuw9{n1;RVsZ3z!eZM9ub7FBbq<;7E?q6tr z_J-{Ba>aq&FH(Gxeqg!0Miz61sJVU$w-R10dAtwo6)StB(Lr`fIYQzw<504s;edX2 zblJl_T@&V`vaz%!rB?)F%(bH=iNu}*v%4i*>M!i0up7J#jm6u}0t~(}m@O8);r#*$ ztMqgL0V~BOsaIa_c`TQd#N`NR#T!vBOEFSQ$?KpT7~c)MbD2lj`mMN>1Zgj)`s*-c? 
zrwC~KaOmCb(SqvQye-HFZ%81+U-FAsOkFQYp1~CiBC3vlce9e-Dt|G!z3|M2d*;n_ zwMU6x8Ewh82fx;Z6bbHiIG>ICuBNKXC6Y(Ek)=YBPdOO-`&nnQSX1Bha_Wh?4Pt-6 zQdQ)eK2_i0oR%D)(bam%+mf8pj~#*f$;7M6@vAUnud3redsV5e=O;eHUuV&}QY?K! zfYLs~l)@MOIr*Lbub< ztAd1RF7oQF2XZGjX1#{^V(nbseOs*ZlE5l4wG<6>p=8};=*m z|3u?SdmoTD={@|<(>yuEZyBspob^gi-HnA)RWFriDzWRONsZ5a|j2eAkrg|cnW9p~e=3`7n z^CHay`kLm5L{?8pS26HqeE$5RBtHf*GniR`2~g zH@=))`(7-_w1j3peR%v`7Un~K@XE)|@vAGA3!zVUYfRnA(odrHFB!BQLuEoUwcux0 zV;0@l`Z{Su*-hF}1EX>i9O|LAyhTDSIq@mzWrksVavM`e(v~=v(EFz0mX{q|mNr1O zPrIF&;GpnHirJA^rWYFH@&*&2JX3+dO@ZC~2TWVth9fn32YV0eUW1_;abPOb(-oC? z>d&mZqnt#4HC}#D5GXgk_&eE7!SihNfKWi<&Z@;*Dl;ebi|_9Tj?W?IC*pS$HaK;I z5jwxUY1NW#m0c4Vc8W>8z|UEM^Op0S_Fov(N`(zg4aYbsF=)Vfta?6W*!zYaP`*JCK@}K+$&9wQO+X;w%8QHY z|DA1Y*vTkp>Kk|0$uTa1zYIrSy?A94c0KkG8x!2V#MulsIB7NJ^PRmJi;Gh#RC|{S z?hRzqb*6fZo5pTe18zA_*8fBV!E^)m+a(e;F&5J zr5GhQwP>6klUos!>2#E8fZLNITmT`Z&_(jGfZ*yeu#6b_o`z~gEs8j(9>h=e!RATs z9I|`{>$tQSzyFq;RmZGM`0T$QeaRt4(WF<>Hu=>gNEB%{RFD}Ie&?3&T{Wp^{6>~` zW4SNW>T^74K0QoVdh>YxiV}-ey5fWLDU1*CQU$uW#t{nh2n-2u)wNFF(oXK!n&@#Qq>Uo2{uh(NdpQM=w>4yyl)C_n?~*@h^$NT4f5ykT#=3Uzc+=qA4Tm z%St2rv>J}&U{=|9*7q&zIr6Ww@vd9*<#pSdqx=_Nr6xid9d*k0=5aAD^y*ov23cIY z5sd>tSe};M;fEht%Ip-?VTuK| zPTtqBLL(|#6E0LbwyOPox!T#XSe_;wU4U!}(IkH}nX~=7dWRx^vAS;Tqu$G7IiEL} zJlc)QCrAHq1!+A0#(T_N&-)Ru5k7`L*u2lfyJl%=9knkJqI$m9m6$Jj&LB#l`N6TP z{rBmQ&}g@R%0t;#(K*`8NsLwB)~h`|lla6XkH7UL#kxL~zDjaT5q!*o2vSKy?Y;c# zdUto9qBZ(Dh16|+E!>%m(y@-b+A`_<*glcq@T-LT*~`EBYyXlNo&{d7UAZ4F1g-g4 z95mN9OfY^MC84#;3P@pb2FxYPiWA47ix7)WWMFLnkZ+aY88qK?Lq>=1Jg@5Qb# z8?M8hW*~5>g3YDB>4$TrCvVr74grjq#+7AOoO zjR-;|)5FMt8m<71Ehy`34;FyMp%66$he5HN!;ml(00Y7IU0`xD&%jC;m8#|RE@c6n z1CW#yIAY64S?@%Z|Q?2En&tPJ1Etmc^sZWWuIuc;r zR@DarP{KfyOnCR=jr(=MiGUQ;IwK4e=^UP{kr);fj=;kZ0foXBxiEA%Qz!(^*i`r3 z3l*{ey5O0H%vUO}?jmL$eGA%HG%q4dX**2ytIlv=2(hkInP*GHDZrp1NEKL*Dvzdv z5`1!qb}VTc+3SoWpTl#Ak3vekpE^KddXaR)P!(HSO|y zk8`F;R0Goe%KVM}L)j8V$yb@a!di`WM&px6VMB9c@tOdby}I(JG#S%F7((tPWte@P zT2^Yb-&J_?*mc}>S#{gKSNi_rUC=%5 z=k7eANP|&1%FldVf_->`#?Fehkdx3Pj z++Hkgrc54W<1a=Igt44`w@LvlE7~VQszb$r`r!`l@U5J+Gp@ja?6ZWjZ^)jMS;uHB znV!)%bg$GLu!(=XGg}qQ-VGp<&-K*8m-KV?P?(&f+xzpvtJ<{Ns28=p_Fh>y9Jw#S z1wgXUK(|#$l*)C&!zMR=7$1z2X=GG-B|!&2{b8Q|IN4}G!J0MFNB$x7Usrq0Dvx+* z6+izah04y__YIzfuD`eE_3=}oM5nX14q}xr(N)D^7u}-KF~2gom?QdwUuSU*ly3PI zKHR}YA$qEo*Q1kPmua=g23vpCu^P|vq*K`G_Ls=erV_}LC2~o@d=l9V{_uO3b9;4z;R^_I>V-tjcQRIuPXKH}pS(J)YdPf}|^ z``dMw_AoulSZd||nV$2jFv`qQOKVT>{o#>Qq!z8Ae!PmBZrBa;1TubW!oz-|=1C~gS=IA$mKko#Tr)=S;eQV_yGf{+8ZAXt;t*z|Yv$2D;mAp!G5 z=WtI>f=Ev`a!r~(WKSd=#3AYh(DlPF@2ot&9=fb6#7hI`IA~Ag^D(bf$Yj$5L@04_ z)I1te9$igesXeo5vyJ_4lpF3)G5`I;?})2=PH}7h=Bpd3;LyCEkXUK7E0&ZsNOVv! 
zjP@Rm31uT8uJb&HapU0M4=m*LNk5-mO6-2ysS=9g-}*uAdaj>o{Kq;PY66P_evgXY z!mtt2Bx4}J=%c=B_IQ@C;_i5MfLyQ4qf!8q+4$Yzo$Tv2vlmHwd9Oa!`k2QnBK`fX z>9I<&)&s36fTj3h2yE;=VjTe76-xxvO@xEp6V@ttUzMoE?Mq&pKO7$ zPq1u2Oc(CFZKo65YVb?3w#%5>8yuT6yV+nz$ag7wh`Dlg*sgu`4Tsx^CNeD#{l?zO zesE|2d87qx)wEXSL87*ix(D8}Qf_L=HposXN*W$Inxg{-cS^cND%y-b%mF)1GGFGw z+wk-{-C#`u*b4v#!H5H31%&CeV#A2vlg7z9Pp7}p+tvAMGVDH8QCOWTY8+{@9uU`U zb!pH?SIOJw{&z|D{~Q=F`c#J!>WMi|!W%C}*yc_Y<0nQ~+zU%#_lXa)q{!US|9hnz$v7e*?l)Vu`%=$$4W34KYqB@62rZ$#v)fMW^~>|Az# zN^)GLV0xwQG>uFiU-QG)PX-Y*u^gJPd%WHD*rI}RJVIvY5hb|gesh;fucQ`5Tmb3E zBZaE%kUZ;Co$PIph`}SxEaYHcN`ts!VW4QmsYDi5E@Kw4WMfiOCb9zHv>gD31oebZ z{+`5bFwA#% z?c^1k1WobdJ*q~nW)B=@nDjBcZI6Dgmfdc-f3Gyj#X^C^WSQ`t60!YyG9iPYH*N#J zo+8M-3;l6*QLS`-HMT@rJi4|tj#czdEIH`ia|6?=gaqQ?s^3}TI=H~Bl^_qX|gnyoKQIX|m-48333{f;V&)fbNL8lH}aYJG!zT&~+=fY$FL z>&v=ch2HhIXlb2(9!#&XaH|Ry{`T+d1TSnfZz#GG`o18WSVaBN8xjjRn* z6`38ZZF8H8O8m>+mFYhVA&VdSrem(StWo-2T{u-Dam1c+)@7@^jgFMG=o~sl*OZ6x z?jP;*Dw7Xffi>J5chZ_4h*3YMn#iL8{-Xq4xW6DwXM2r3uRS5>^v^FBVE@HHDLl6?jqbTdFgRmFxWu8TVYIV9EsJHD-#Gf1+ahFN$GIfN)G2 zqRBuWPf1|o2|7$yW<6TUl{=apmJ96r+5K5=A88>>|JK926?otpomc#v0j21&v+IzN1^cp~lxXBS`PhoNgT^Y(dOVSzHoa zB)HThcULpUp_g;e4u%WHao{AJYIWbuP%EKy*Vfj3{#7A<>Wg<-fy%BVQ8(1Qh#Z<3 z3gLm_L`C4pM-brHgd_Uv*?Qmzh&pUvkcI*dlOo5s;bOd*Ffc+`AgkfrU*}49eFfda zv$9x{yf;Gmp~a$YE*6qL1o)2ZY4xqI@}53^_x_d!{;3^@s1y%|8^njkFrXnYvejqh8bJY6I{-Sk@y!c!DNNYizbU{dQCb|nD5-uF2rE%w@B8KZXSzDc(Sxa;Z#sTg*dt2x zIInnTAI^O*w3w=$G3O_A0fG2S4G056K?t4ha;z~>H&_)MkJk<5!-BG2_A>^bUJSLu zW++-qb4LtG>>*Re)yLAaOx@2lQ0wY&B3uRFPZH&G033n~&ny)JnRtP_Cyf_$(6EI5FTut^yV0YDNJ z2*m(UCMYNi42q%&gA>D`fcmg|;c^*lD1r@<*`8*&bH&q5GGIo%#P;`vL@dkE$6vGa zUHtQHp|Z9~tj*=S_v%W?DMIh@6+n|>LPkW5SaMK!6RtjQPqxtcqb&kpkccui7D-C0 zP$2DF$j?jRr+Ny2ZJ^LRB!ZFrmuNTFxj@PSFHgD%N-m@H+utRZ;toNGDAOG1MPw_+ zP<^l_(e>`JX|s6>6cQeGGg!iAOSk?t_|5*~?JD@RoN;~PCT?%~e|mZJ#BHMc$X%xY zHeKE4HfDc$dQN)WOAOwBZO-#JKJBimLphdJ^^@_G=P%lX$9msgT+#WiH1{K3wRq3Q zSB~O0eqt9P<6a9&t1AYIt`o}vcZ_$u&q?SO@XBtqAX4c?Cm%TY=E4qeT3zyw?c(l6q4t)4y&ExJx8Bx^^JVfJ-&j2IX_=iKa?}iW4%L5F%Lu?nz8H`o zzOto|ogTN6%v%kX=^+csGMBxJySLAEDGCYVNS{FLxjpXwNCvdTe|9-;)Z^=o5jnD1 z^0ocPpr7Ao#kbH=?44`K@uHAb4CSwTl~-9Kc^y6nyMFl;aH3uzY8T&fSzwKo-6pX14Y90FCh09? zrtHo8UU|j69&8ryI`4Yd>Z`>Gm;$`~=b!n_saL@M%Puv1aCEw}f@X9Lr!TW$koV+< z|35GY4;yD-h%u(`dIUs9B0 z;eY_GUmL~Gl`Adpw+*+P@n~|z?~hG}NmJXO&Q){uG0PS_D(*JXT2T!U@gn)p+AiVT_TN;2nf zJ1!-aWa^^y>D5D6tfz!O2kM+QmmdG}vbPcJ<$_QJ#t8h8m~2?p5zG7vw@X_PaTfE7 zHcAZAHvxVy>wEow0Pp?Plfln$oOaGa`vx%-S+; zaC32$0Jc~hP)51`+#3Z?q%#|f%QTHP)s(_PBh`K*0=ph}|CZf5qRJ?fA3ym1MKpxC z;q@rFP}cI7A;goezu%w67>%?U+D>)6HFS4ba{PY&??m{j{;S(5ZZ1w+vN<}BU-XBr zRjG0WuqE6n zEhL#r+qYesPVzTt-zrtUC5IEX#?LO^DAC|3X52)b!bvTbx^+v2@cmb?$fLHb@4^fP z@JZ#M8}4G$W6c_aqx}3A_`~nASi9||#y;--5tRDbe;S)aZ@kF(8FiZa{B+&yN!@;2 zD~WjaP>$ypCV9`d8$Dqu65G|Ty3IVy6QuRc`CdNmV*OQ@pA8;-Ka_T(YU}J}grR!G zSS*TMO7AS}2!;m5HHaub3s-%;RsVAFgqW8+xNwE1ZGt3kHGTHDrYWqtZ7%QmbhDz- zvn~zQW@;Vlu`l00(nQdPnm+eB#`$%o5=AaR*-ul$e;Pc{dq5_pM0sAb+1g1j>o^<0 zBVl?PU0pbw<6R~8F(yyAdB3jQ00kq$F2*(9jHCrZb(Ts>pHkZ{=ll@)dLkT0bc37c zo=_;8Ld|HD|Gm6=G+9+r5T&SM`#V`Obt!U#DZqBqRDY{4A};BB{K!`F&9<9(vsFe# zkkj1=!NKPU0|T-(M!XchtDdsq*?A0b4k{(JqJX1g4fcQ_GMPP$A5$bTdlk0<;^A`$Y*MG48XoA#@JXr27WzckLFjD&Cgv? 
zN)io6Ru(0z4if78)Xzd*zHJ~?Y7SlUj#=gLueb9#wl2r>K6{#T6(gEs6PJ>Tql+Jw zLqj=BSDxbR{`t1b4XUG~AfvuIC`%;RA*7sAHLmMly>eQydH!E^EoQOQ=L8~Dfoqmd zO~YsYx^Aa2>Su?Fz^A6*dIJ*1N9=5gB?g=#PmdRVn2-_WR1?YPl=~aW$phafvk(i= zkf?xXw-utpgO|M~S*(K)O-;YoccdPCUzSj$YKPrxets769?Far_^+e=zjqeb&hpz* zOYrq^wmz@{8Cb%)%*f6(=K!6jmKOq`MY_??tQ*XNAP^oAQ25%QY4Ca=jja>7a=-6IjHEuqB-+OW4#YL(|GBd&-}H;VN`FMwxH?&w z?4)dHsgT64h&2hK#( z&Ou7}REV6;LqaD?b6OGAgHzfoikbB~H+qZjr{M4cK(IWVCyd4NU)sBiQ7f-1{b%|! zQIAinsq}IXTq(VdhEUC5`pE9Jy*Ra#G2MuWp~9Y_kg>0~yhy48DsK+Q$$Ts_pSHGaQme87_^;=cN~ z1|5$|Rv87op+`Ee6GR=tfqj1Ac)XIrdQc>(`^$u(7?)}hLDDeHq7QC@lbOxKv>j0&sc!m@xr!qk2M{aHS!-ey%?7TK^Jfq#YtlQVnMX+s2Qy?QH($UqPI% zvNl+?@?{Z`5Qy3<6fp504LqE>)kmSgK{GmLT7L0McVC?82F_ZzVi2M5WP`lKy)I2) z|GGp7RUpH%N^)J4xKWq0nL4W^N%_2bW)Iwco9GPx?&I9klR zay5hI(}!}9k=0XmcRC5ra}us6oecO~dHPsfW6LC8up8=FJkd6gsPd@iZUK$=7^MwTk(Fw?^uMFHVRMWtgb5iTUi$ zW?$kTCvkK4Ep(>=QbG1cKduiSJ$dpwhW~?C7C_?d2R(CQRJM8mHs7CzGd&gwAxrx= zN<^_q=OH?%8Lh2rb}&0NZzT}j%16n&eeKE@7FMp<(NGmZ3D-Yq?65x!`d$+v_Zw4& z{-&~3i1c-e(H!}esyOZIIlq_{uGXuo)Jf_?#O6%v#-={5Nb~4=#q8=H3{xuUdu+U? z?{>#k(r)-Hf9&sV)Apyop-K5y`TWaOV~sDKdm1>kl(2PuXkdzqax+=Y{)KLIvmt*~ zRA6okbZ2k5yJlQukR*ln(LzzsRu)?YG<8DRMM;<;CTtjEvGWFy3ume~93V*+`9Ejm zKp?`U^xyECpwZJyiJY3s{E63#19iOV+MI|y$gWFWsw4OVsUzt~xkw_qku%do0lRU8`-wT ztLy~B*B_ar=Zr;YW9vmmbz-dUZNC3judwuHsqD<;Qp+IC=E0j>vVWEHn+N}C;s{w` zG;XnEYis1mS<(bUDm)gLt;v|7AO$UQ(P_0zdxElw(h6Tsy}z1rmGFS65~^G8k^&?a z)a3GaOj$h8}FLI8pB??*8EuB@1CEwb~;KS1sFG zzqK!P@gyEnVA8(V2Lj5NeIa~I$Kq;s2zv9kPP62Qx<9Ngek!a_Z@8=tcWg%>TQG1pp344~m&B{}2R!YPx&akNcEt0i4Jq1lQ-k}gxPva3 z#?~ij=T#rRqD?BAF+rQ=UjHM3Nw<5b7WGdm1r^B8L}!;4XxK%EqsNTvFCFbY3C zn^a&d5-XDXd2IFVTUC?hEEi^d7UmzmaSb_~RQ5tbHqLTIeU8cT5d`$3Yjwb(@Ca=J zaC(PMMw?{1(laK#a9KBXak=pLmTD=Uf-mZ6pNPaVYw?7E4pqzx`kWGjq7YSMYX`Xr z8k|T?8PZv>j6H*JIB6D$=1s;aVfMN{lMX3U!v=t+qi|?|?~&X$NigA)J+wf>d}5cU z`-CvG<5JE|Q6N%uJ!vKO(Z%onuZFiZ<^G>+IK3a4YF!Fai~;G~#&hR97XNnWvke4g zG(~a4%5{p1jBG%N0$gvKSos3c1vaYaVL&&aJ9C&S-9Wz@!o z-_n80dGSA^j?{yte3KR)KLsVFFrA)xGz#dKOI4u9h z*eHfa6W;s*v_lhq;@GW!O;X>mmDNn)dG`JgPgLlCBwYhTC0rA}*=^QV8*bLkZnJIM zwry{AZ8kRBwrjI(ZM*TS_xlGkXU>_KGk6|-fb+iT7yo;GLF7Xi!IYJCc3d%kBJhCA zpnHQ#WO8yra)VMvQ$&CP$CJ>^KH#COg&&*^3)#SuW{5s-M*oAUtTz}``o&8z9sdh? 
z3B6~HdW4(pS0+vfArb>zCVluI;%cno1E(@!zPLy&zovBGHP7jXL>al zDH0Uv{VnF{e8q&+ab~IxXj zgS#V!l0nc(zyH|QnS@f$WZCw0h*-|4q>GN7CdB><`)yUpfw;2FO*a*8WIluAeP_)b zkEB2jSj}(ag@la%)uS(jf{M!2@+Jk6uJ64FAcAmOtydwK~WNbednH~OvPJdBO8IKGc0Kw?r+(drwNrrMiPus&H{3+7bbWUCEZ)F6Brb5Z$H?PoGvQNyM*4y45>umvj>iw;T}r8?xQT)4l4AO{EL&`gZE8)p?P$?a8K{= z>vN7FqG((^YOBhrVM@|k6n-i}FaG!aEKAMHuA%&2LO|$>T>gGY#*0O`4?O&jI7shw znxQ+7az7K7NXf{vIU8St_ygX)rA*b^GF-0N>X#;+=#40JK(ptpn%9=}VhXWf-Jh=r z)XnPsXl+&|L(sa6nNl`xAti|&9FMW_^!+oz*urc#`{}I#8s)>AGzX8Tk*4_6{3(;s z@x<-QEEX{#us~|VZE};^=VG!jI0q;TNgf4{Iz-u%_cu8l`S2vtOzua+NXmkhnzK;t zStgk$J=8k~c6dsqoF?KhM=Wescs!ole$uJ|0b}EKH-q*=Mza%u0S${sH`JeHO5fyd z#!tPD*_;k@-sWcseDMU3R`@v_;Jq(Yu#k16qFm(#^wrwkveEj=gzR7`gCj5*NUxy& zJgF+9V(nAA@2*ap!+K~))Kgi=sVqIlELko~EBjbCPfNP-C?TZXuVR_RPhPHRoOSS2 zvHtXI`U4Una1rmfUGdQRMvC(*H#d};Jb6SbId`c64?Z~(k#NQOBs@SLS0v;o|I(;^-<|{0FX7l= z+);Lv^4NF0PrX(cGV3Y`$7ez6e9eE$kL?@n~_^wOPQrmztQNapdUNsxmgb)dh5`c54$bW=of>){uZgrc|Z_xHWM z3@P7)GhLE)JM#2XhL1_xvnH$ZP`l&(nyl;X>%5TLuv7wLrfMqoDlifG&2P(eZ!a|c z7RAOlP$KxCa>Feysd#e?(ABVd8c-Q4w2(=xDrIFXHx-eD^*@>ExQ9|c@G2PxZE5Gx zU>W$2Mm3twgGl_=f)O#vFoWzECA|>!h9F8k;PJDHo)i|C*SJO71*4S1=s&EXp zKtxJ)lyawPDM11kcmhn2tyfl3B-@W|n`5pWv#1Sd%`2ZSF4UeCrw*cOOE$}#u+^o_ zlQ2_~NnA-bAsX{~=q|Hzs3NP9E%6B@?E=-KoIVGfb~m-KgGilKAoh`}u-MjBVw!lZJ-54QP9?m?-L z^+>(_Oo&j4c#IGqPCuAK-7Kc!S{s2u9oiz$sA~!W9b^Jv2Ge%}g+V#N=re(mkkAo9 zo1mW%T8zIxWA5sI7$zSxCn5jTXhrsp|Ad}CEW6^htCue!VuMNds|!j4drRqIhG6_t z^zp$11ELKjS)d_$Gbw#62+XV;S7_f0Dez{V~j)O+$?I8P?X$jeOM^>y%e?6OC?VQ-nJD3;?$MgFrv^_k*`0O z6r(o4!D<62=}-bz0_+jf5CcK{-_lS5W`j9+alSy}`up>y5d#DQTd2}t0>J{)7%;$m zuZNM{csH_#c`uh2It_=u8D%eDRLk~Xk9TW4IZ|K&T_XGi#Cur~rNMhxdhy{WU;;3D zUI0+wS$#9#5E=XI2b77TN%?;d@L*5yLr(O$Lq(H|b_^ZUMzaqY_jM#iSBU#pCR-9U zCaJ0r)=dH%!9;@xM(uQoan#HI+(W868Of!OuxbfGX`WOxv?vQ=H*@?4l7ig6jxJizt3yY{^G_s@rmY5ZWkJA};WyrYfNZqCppY;3n|tahrf z!|Z&l`SJc(QcNXn`S&6C=75Ix;X&D_&dox59N>o#L_uY>ds}BbH!gYfjx-;_)T+On~Wn}r?Gm9GfHve z9~d<4v^-!(ukCF?+Y3vDnp4k;&=`|iLEvj(ezU04Tx;7^e2mH&Bv6W;VRfc=wLEJ{ zV0G$k$j5(H3K!hBn`;n*qF96d{=U{WRj-&cL_LSN;oVt|!eTf@QwpdOs82sW%=hOe3QU|?NZ;nAZyk8(A;sS2H z(oCdZo`7LMAe=ukkGrptl=2I6XE#R6LB}MCelz7-LSY^wheGLM#BO!h$N9(lmX}L@ z6_Qc^Q+U|COZ;;aN0Z)SXs~vJ;|8*(Rseswpg{2lwzN1s6vYLA307Z(XRD#<>h@oc zwXQ&ZMbC8cQ>j~A>$SSR5(1v7C($X2(nS$8)|b9h^DtDw5(7Ydu?VezdwyVFF>U&RB2nR3_IV|!vMi$2BSj!GGhESg}uhZ<6 zS9kR(2>T{eJ2%DS0wV`A%>KQF61S0es#dbI2`lnp+hh$WFgDmze9ek-rraO!l#g9t zYJ=2Qvl+y^y&r5c_dfXG_b6X<4@5e7;v~xSsqk5%j&Q``8OX16IRC?VKsULPtvt@GJnT%MB~2F zb-!_JeYWZP5E-JlX@ECo=!tyx=x6)>^FxEn;qP`&_Nmb;G{%~$;lK80p}g=!@Rl1n zRAgZ7BWipUo47#$cvF+zKMv1^fO6RIUlF9-3llz6tIm?hO-JEg zgAU8!4c?x*q0yDvcirdJjcO05GM~0Mr0svJ<< zEjsidbq^PUt^&J9>$%iZ_NXcG!TXC-o8L=+f3#6p_w+G4&?srH%6r&efwa`59u{cc%7rqYxr%aM+Aq9}7&({$BRFmh1= zHfQ|)pn?Et+)3BBSI@!Wp6AmTEj(ff{gFt;^1+uHy}_xOdK-Bs3>E_t1EVUOU5oD@ z8hD0jmdcM-sn?T#ue5@3m5G!{YYcds>)|%1C2xSK%_7Oj*DY&Ai zsdEUT^xg>TbGY~99|}B}gW(>MHe~b?$W!e^PgPRL%59Wr#T-<3g?S^N7T6UIdt~0uzxXs7}G^{Yd!-7;?aTNov7TZii3XSsHM-AiyFNJaf ztJvFpKFy@enei$`ronNPV>f)Q2Js*F=)P8RFU}kzm>~Q9W=K8#f^eAbLp-w`_eKps zD6R3)MrM(VqSE!U=zP}bJME-0LQA}*KT;4l#GmewBPEQkUXrVNkHc<#mb0_F3f8vqv%f>m(}&}lm;fa8R#mgm& zWFFdG2NFKyC__A%j>p%TAA5fvm|()uSS5s>_Ae`GNkdzYuOl9GGX0LUbelJ~eS`0JL|$u}Lx+bPjH5MiH?CKWS9ArL~UsS&{#XbVkA zbER1YDAbO!TGy9Uj4%4BabuU1gu6Mu6uBB=X1B!Bc_ehw79=Ict~Jvp^nd1B2j2&< znhEu9!cjKV;H1*BG@gC;{t%A>@0q3FM~^PVN-p%;es)%jq!UA==Yq5qk>mMYtuZ2h zOk5Iv(fMJ3P1+q9#%_iNn=-A86M~tuNT{13Jx| zav~1h@Zhy>rDF*%vekKF*o#t*Xs^N{>^*Vya(LEIaQ5_KA9!&eyn&q7ST#8MTh{X| z?L=AgbP9WU^r(8YY@KuMU2EkNQSZS9;*E;%1c_NV=!l#2e0>8HJ8j8_M^MeS;tr_K zGJc{iD9?6jSpmuQAu3_Cv(EOxd0;{d-y?t-!kCH?#az&m^9q&~oP%a5gs}Qcq^rx8 
zrz%QZ*fsATyVCtbXf{d1mRZs=DpKyj(=s2_?Tps6JWEvnqV7FHOKQS2Ukf&|EEM1E z7FCO?y)wz#nX|F8p!zh;@gaEPMFbUg%Tp`U-7VDkg5=-qI|0Xy{_Doxq~(G;lGX9c zE4W~W?PEZP76=1}0LmE~o|+2Vj-9}PpdM(6|Ngm~RLaQRc!o$m$|@1a+A3DvR!LYE zZ29!KOAm8PLW%J7>8FJ0DwcRVY5E_mp_(o@j*%N8%epKo8kq-;Yz(0f5yeRrs>#J& zbRL?kZ))#Ltu-n&8CU9!$vPQU>JL9wsz+HWfOxT#$|@M6mW-O|_EDMDS!kh#sK(d- z^mE+PyrCiku_O^3du~_N(drO?e371BqW7?p(9NmfO)*U!+o9-+>c|Sz&^;@^z^wSN zm-@gK1^UxZ97B1JNRyJBntrq^ulBN|7X5t@7L4e+kPLcX7(d5u?AwaIKjM)Ck&$C# zbBks7PMJ465{yLOMzRt;qm%P`Nh!$z0g7>R-sL&ZEK0-{2f@!UrmU&L>{4<(aD zG}r{aGs2201vYC;%NJg#THZ4yW=`#Cp0ph3Zc@f6EY3lLd~%3FWC7C02ZkZ8B16<8 zm23+)tk;U?BX2IYs1t_f+a`)*5?9`D=d~M|NDGsRcrN!-^^Ttu{6G?n#ydf&r6(gvjy}5wBFl}6{2-j4V^~pAwEvrU4o!~s7$pphh zJ+5IBGGM#pnvNW1o`4gTbFG>{h8eQ6(%LgQW*ECU+#@u<(7cxazzI7#`msrtcZ*NLSs7y%a{ z5h^3)62#1($p5^sOr}7oPD5o>?yXtrAjT*gm@t6Ai9}hZGZDJHB-*f6*`OonWQA?1 zvf;i7A-5phC_SySexZbwpkn>H&JkVfSv}7qBE0A}EZQivSx}|06w0DnwLy!YZZ`z% z)t_0tdP_#spnbgRepZi0ib&MQjSLgWg;$SWWl@9(5gEusYaj}D|ArhwBDvWlJO{2U zlNnF)-`vFvg;KFbm#4gs<3#f4C7;cz>O>=}kuWsMZ9%lj!I;nWN?NqijRw4}0Dfq` z!Uo<1CE5xFqVrX+;yt7Md^y^W;z-NJEEYyw?qWN&!l&%F()ej^EtBv^%q?`CW+5wX z23Oi~PD)}j$K08hfuuwR0itGBag-}$gfZwvsK1#_D=y2TzU>hNa2yK*>C`Smk$Kw8 zTm=hJ{VMmO(yQobYbZ&VYtse##;2j#Q4B1f=e$Ctv}vd-33JkcLL%9lv*}fV_-wBj z_ad>t*|o-XP=mR$+rk#L75nz~*n8)eF(IlfrSTx&@3Vn0RFU5Hhi)H7h|>HQ)0B_cn5*PLn`5TVD)(6K2W`a zS_aicNI(RND)^jAIw71;$bP(Wt}}Q%lN7cBm17X%NnDhdNocM_qGPRuz7?6F88o^fchADVF4(xpXVJeK zl^YR6iq*s@GjzL?Y426lcL{JUEQ2vvtv+pWKzqBSt7|lG(Dgb_)VcaT@%OdZ#ExFI zK9yQ;rt~k3-dFg-it4%gKgkmX%y(m%>bs%%p3&y9%553#)fO6(YAC`mD8cwMjcIc>wYpKd&H2BFTE&u4F-6VP1!z5%cO?UA#&y$LpbEGvK3Hi@lh*z@r;6jJ205EJ zDi$*5|GLE#3YWKd9vmwoh*&b%6S|@w6d9IgasGzy6iI9UP;xOPCB+v02WqK${!`}P z&WOGh=22_GU=`!Z;jH!X8=TRZizL_PQRC-Cw4@r zg~iZ8>n@Vn{8GtmUPz;ZBfuB_@0=+IK5%IaVSW{O3ZC6sIb(!1QgCXAHL{|ZALZ%S z^{RlVTv!&X?5nVGugiz}${+-=hi5wm)-vn3FCQ^x9sA=qpOGS8l=rKj2j-Wpl>8PZ z-2a#Y#dMBz_ehz}%*VS_3sm?;*KW_Omj>j%h;!L%BwGqp=}O9RYaA#JMMx~M0T&$< zT*h{#e(AvJ3A^6Il`H+Z1|dzcG)U#gY64AEsnA3aa~T`iN@#4h4nAYb35ArQsjs1Y zCnn!4ZN@~`VCJ1qo}fz%u)Q?r!e$M3=?7CbJ{>H`T;9i_H|drsF~{BXl)c6F)>8PE z&C=D%G^Ev?yTq%Sw*+SzTJqeLS$rKqCntUSEu3L6s~mPcV_BzZOH5r5T)-PeSB21j zCM#4qI2FAj#{p879%Q3!fNyi>77#Krt;1RWd4^LIXQ7(92Jtf0^crxU+E(T5Ru39r z3BN1McI&NpkuVnz>uK5bd^89ZLaWVR$qAMP4U3kXOT(swDr_cyo$1(lk4<)pDy=Uy z7w0p|FZ_Elnz}z-G@2r-tug$$$lQ8MP|B6w`-eC6@mJYlJz1xpGa6%Ys>IM=T~qsi zvd{f}*Aap%mH;vQd7p=-p|namV?p6;Vr@V5h?03sS#?c*Yu_)_l^ebK=S}DrUGg8e zVd?{m)Je!6cXhbdmlP%>3bwp}R(<9{2a%9o+$)Q76wfk2Jr@fP1LXgdt1QAg+!rOh za#{<#Q-a_Qh$e>0Hxa|Qjb%xXMe&x3O_fMMF-SHRH&-qvFS@we3wB>_VX{m)+r8ch z&zzh@XD+yU{m3ONs*22wCLS5xq>X5rtI)uc=gyb6rV(4nL{H<6M|qSv@S(wbXQ`euH1(wU!PvC@<(Xc~TPuiRkcp)*jY1*J zVVni*0`>K4o;}M?s#ED4gaWUKk82OkQBq=PYs1^<9gkh4b_Cb)*3d!0>MB1Ck*p<) z+crXQRU)>}c&s{A;?mwII*`bns0ymvH~3$uM`9jBQj1=&rq-KWJ^4)`{fF!bvIA&^ zY>)|jk2c}7)RXvvdWRj8SVGOs6SSO{>5N9vL4i3*j^B!NA|*Ad=$#ZlFjdk9W>jgh zH2pG10UdCPhP=!{nQridg_8X&id*%r8d72yWfB$a*Gss6E?sIAPZR8ToDLt#fAN)2 z5h9IUA9No?K5RYG4aX{UZyns5pf~RAS6V4LPea)Lc;(M{8(zSEJT_zA`glR8k_*EG^G+2 zF=!+)9?F}(N8Htv6l?+|L7uu=MLQ3oC%Xjz(jw7_DR;yE07ylo>=-lv6P`t)FC@EV z==JRF!J;zT5PX7<<^M$ljZvkfRx?(*m9r>GZByFw*8`;cGC&6Y<55W&RRsP++w7K~Sf+C(%4ivWS#$gp z5Pf+qzHHzmuK+<>ph%lSV;Ow5;_$*Rb4B)o(_bBvxc_{qNKuQC|5hTIXMM$bTzp>C z$o!VguvbRimrFn!ObH-@f}dO|zIp|oS^qGC7!aHRH^d4THhbD~%WlKVD5;>d-pQBV zQ6jX*!WEStbpIZ543d-4yC<&EIt8QPdPV6 zfh`%bxD>AV{QX)EaV!f|4A3a5wJ%YEohIF2d(0qy7>Q2$7icatE0%H>J!0i=h(*gg zM@XJ3#Rb+-O5?uw2gk*GXPbky3KDDF0Y{!%cv$tnx3{zRoYRu{M89vSyjY9&I!HJi zM3&o&;Qm?f-iA-Mdiyj_!&R20Eg8T2yV#ih-tE$Ncx{)z>;OzNjWV6K-v3?j#X52W=i-wEIWfx% zeqU5|S`NnJ0^HEr{3)4|sZC^LzC%`FD|-D`^Q13$E7qPz+_=By`$nTYT@Rj!eypZm 
zT_vPFwmDP=Q8TNSCr%nkUWhajhNRM#R~H?f$ItIh4aQCj-N+j>j8}SdTE6Ipzu9%?0tMth?`q$0urdU!OcDe<39jtTpP!8=488N6;g%j4n4} zJW5S%i6=cXp*DJCew!f6)*jM6+={(;#_Gs{Fv9ASO+p?U$ZO;m!$FIb*Y>6AorozP z@%;AUM*f4Eg9$6CRXsiY+$KBpvz9W?D*aJM>W7Jo3|eG`4i%03S-E!R&^MvbON?lW zK!i7J5Uk8vDO>9#Ysm10vCP91AX>Rx&mM?TWYv!}nh)!2YzYDAPRU(7&t8; zfCc5pNOi{_Jic^}B1|rP(0h!JgwNb6+4J|oTLKqn$#mXN=${vVh8=!Qhb0Xx0-CVj zTA#_1`FT6e!2XA)cg~D~)lE1j7WatL`8T5`nvP6-0E47RT{_ab)(PaH-uK;7PSii0 zoi~hW!Dz0b9k5p59=>3i4$CIv*SW+T(}i<=ST@_9*FN9-DU~vKRo9&S&5fF)Lwl^shSa5--}^?#_>f@;>Ma%p zoE)3prreVF5K>))f+9d$Uw2?+$p6}T=O_-FQZO=!K2X9Kj*4)yyX*b&yBmi#^ZMfg znHKCES75Xd#&y<}-@p00h}JGks!XVD(ao}OQjx&GaTZS#Opc0e1=cznD)BLDA1O`J zMW`Tk_Apbm(3I4Q#E<2BMid*S2X^HVYGf)BmXE%xKh};m4w@j~Q|2?!0&?~@O>r<& zAdNg4xGrcopcgoTBMMusdWyE*)K!>*tIc(V>W8l56%DByM)jmyiXc`yD<4S&LKCNo zP9JvA@ah@E!MoAJxF17q-hOibHcKT|Ne7kIzB3#MNbSFwT9xGrD_OpUTbY!GE+lsuep* zmd0i8*+(7VP0C<>$!7KYQc2PB-e&4MM)JgvJZv?Ev8^)CG1I8h&KR3OP|5}-w5I>DGNtk5B03I1j|5*LFr7=OKuJ9{@ zm$0@RtCL6=6FzMJmvZEw^Nd)}zwdvRPv3V!iuW*qB`C;ErxIVpWGC28i4D#|533D- z=M_&lfqpqkG*)>pF2CDv)s#Qti(zApvtsi#fnaxwkJTmsxQm|}!HZGD{ zpq(Ox&sgxm3%u0z`hijZ5@67rmaMW<{qMJ|S>c5$VfNE2;@^?vKzDN??1tJa z?0mwo?)Pts$&*Sw_l0zEmOm^8CrVH-d>UQepZe=`+|a9H^&cD|iSa{7Y^>f)yTjMK zg5T^Cb=$qL9Z#%U-6Ap!1}mI!5!pO@V0Rci@Fc8<1cYKi-#CcNQ&Qycbn;OlLVYqT zUNEgP%$j z7DQx&WeW%mcb-6QhAj;$emLSCKxB7lf`wh6r*zA>XbmMi$~B-Y@Ig_8lriPq=?-3c zos_MJw_mMRMr$nKQCx^avl;f__9)yVP2^p{i`Wh(M5}EQQ0q};p_sjddNpri+QeG} zav+T@xqPdC&N?d)I8(-W?rswft?}+<2FFEHyGr_~w0`zwrg*WBqdjAk$)WDLOR?-L7XcGS4g_QPc3Z?De<1fglhxQmcVkOEe0L; zYf8LZs_C;Z83WblxrJ2I=~ESll}DS*D@u0hA;IU&4p+x!t${3;3Y_vnwL^J5N=haa?h0bRS7rx4Nzj5fgK8qd zf=}9o`)NRS?-ADHeaA=TW}-|N}>6X&HIYWJ6NoZ zBglR8P4tT?g$*EvD#aZGLP^982Sxd`ErWN73b4^Z&HnxKXg~F-& zL=TjtQ3NQWBqa7AWc>8wnEck5h#ZIg={3%&#`_=bxW{ zA}*mFSln~!E4;v-TRvfzXJ062qx9XD$R?DO?!IZq7U4zTD%3F#F-UpOk%~evl_M|< z1)KEYx0)&`oFPPyA3eo_ok#I3NEit?zJ|blpE)etzK;+APu483_6u?1{+T}Mx?ZL_ zU}wYu1f>x#sopUv?ij=IX$z zz%j^q5!IPa+o)(S_sjp1V0|^+GOVQ$)JWpWl>nc8Pl7Bt#8^6TKdQeh*s&CsWo5Qin>>@*K^1WY-$xCtxRF&SIQkK%t`>d% zyb|o_`gseUR+}!1tML#nM=F&f1(WvA4&9*5F=ZQ9;H=&*StHPRIWmwVguuG zN)|1qOIHMMQT+~!c!yTNlE_4x*REN}aw}Xb8d{=MRrwKrKXpIn5&r1Md42P)OO-1> zY6>sWY^}LN$d-xP#v)oEa-Q#;=ZV*kuAOZ`^dB*lN>}&-FvwA*7ODe!yZ4Ad;vwsZ zBsIF`TS86L_aIAV1aDK`Qu&DwB+OHdZ(( z1!vA*9qJwjKQ$Z1B2cK)iRYA)jiE3ChSokX65R(*`S!7zamLZjZ%|ptIt8^Z*XN{<=kFQG$_VXtsRk_)>%1FWspWfgh^n%uK~9a zn4pK?HZ{>Mp#bRv0L%&B0cglzkFB&okqQ_XP$WPy%hJ6Rr^P|tP&5CaI-o$+T1a_i zcZ5Xos7D8ZF9Pl-PNonf?ALSH5h#Tw1y=fmg32yyRu+n?&=65Gq|CPRG zxt7%TO2cJ|uV4PK`s>`tZCBuIRG+s%4h~4#I$z9fbXa2=5m@i-!$=QNCW)l|LJdhl zI13J_+(=Dh|IA5t7?|gB_~vA2p*JK@IMB#bb8~-~)uSV7hjUM_ss&hzP;*FSzlS}b z`h1H$6nbwa`b?|F4Ha({Ay}Uzx6B4k9{{s3)Llv8w>`V$x-tU zCR?Z}M+rAKZ1_v*nZnCRK}jm@BXI^InBUVfMWx_&Ac|1{IwiE8er`NUYh^^$-@dYR zQqD9d&KS@KV*N9<9Ncf~&piAJPXG)(HQZXNmGi^yCOrQ%;F<#xjC^uo}tn3=Sr}vAu+Nzu0?%pXl00ZmY%xGR!cLO~C@RI1$77HJLhh*JN`D`3| z1O7b#9maG@f}yFhIJi;V$0m6SP8@mfQJLy?aO77hyb>$=i}zOgh~{)LEh0>)yOgP6 zorJk~KKxF4_)<)N`%TYZ$BcqKX?_gcY2z1`Y7B-#A;{CEcwXz<9Xij%;Wu8U5gLT7 zXrj=&6)i0P%u_0G!yta_8zi@VLz_#$I1l1Z0Be~a2;_u@|A7Wew9bv(haKMi_V(`k z_T2$d+d7H^_Fo?4%JWM22A9 zb7(|Fmw=3WA~V=(GbsAG_d3^TAV!^j-j~F`)0;jtCT`mB9FO`CVXo)YZc=QY~=uURxR9&9&6@ zbDk_AP(n0ge?HVoKAez`h2bfr2_fd}i3XA7EuH2bd(P#IY2nmHVU0g4>w7&mJfYF1 zFuYj&5o)jT6qqo`@w+Aa>87YBk-~!3OdiQ^^f?>H2w@{8e@GRcX^<~$Dw zC!&_ZX#q+!bL{Tyg2zs?#@AMFQ;oaZ{dXtdc1J;nuoX!uA3=uoC^@wv4gO2Oj~C*f zkMAuc^~z}^yE9Dq1z<=WN>^6FRRxJ^+58KBj?Ir?DKJ9+YQr}nYJ0>$gtoYaF-7g%5_NoRd zA4d>J%1-(kMmA3|SC2-H%n5`Fph~< zCJB~_$_)G$raaK221+^4d0A48Dl1Ur6&6;)9}E}`*ApI}zGa)PzpmQ%W6!ssY%m?s 
zOJGRsLSa0Lj#W(IMa2|>w>1DoykcUtWL!=mE6jnfwl03YcHn7v*DP&agnOaFX%LM6 zuQ-uv4r*>x-o*K$i}?+WVMu`=yQ0d{RJ3($TE^M?ycH&!;p>@&rrxC`TYE+L+fJcs zt|qdnlI?D%U3R&Z)uln$7Dc%5kLoAyeCNE!wgwfVWFxcisI0fw=L(l_FdF|k$Ou+& zB_dR3`_)fd%+Mj!75k2(iX2%Ai`R`f^c!yj4AH*}4RfjFIEn%)o4U>Pzd`*`yQ@aV z#)5*__uU>Ll6MTOuR2$dq5UgC357< zDvuaVR{{7p5Q^&Z-__ZBs`I>f`F0SE)5hX<@S8YDez4h1_|eTx?#*P5@nC#%DT8~w zkj`@FZ%a6TUZ`pugpZx114J`L?H1Ob_VU`YM5s7z%HvBgHTGaA7gG0V;1>pnu6%~F zGJ)e73fy3CCw9p>5660010BR9=)wsehDy#J`mTQE4NkHat1# zeiZ4$`U9Zl2a{o;wqw6Xnuo%n=nQgb#5G^=!-3&axeXa~RyfU# zs`sqEb!gwI;j4?wQp8VWp=dnZIf5LePHZH&Pb-_TcK|}k_#CtrHz2ZJi5&ARXz*AZ za`N#Bi6%WTraS;3PrkQS)jr&~@~AtXNy^4b!1g^?3>*E)6 zfb)jtTtkD!QYOLD6*8E@pIChim*RGseCi0`Ot6atDK-UX_DEn(dt!wCfx0~vWg{k~ z(Y$yC)w}{(wBpt)bFe`_@uh78lflkjRR~hf%?b$cP;|4@fADUtH#oG zIX9Fmb~~1E_2r8rQ<=H*VowZIS%UaoNl@_fjg-y|totQZ?P*N2UsXbx(RGnNr*)1S z+ArA+CfjsFfdC4vQhhAhvHW9pAId^xkzxw*MkX9~x=)|5PV?0@mv0>9Bt;H+xvuyK znOmu^*QFq6FqWV@{%(e?`OmAH;rU{OD4q5R%TS6!Qnh>284U9Yc`R&|$#&62<%+Ah zjOER>%^j(2CVZt1OQmIDhqPtIBc9S~-6KWK^E)S%E$BW^th6F5CBn6kU{S=0v3qWP zeqhkKTd{e*|2C)?W>Fr>=xLciQvEs)-6URjn#EvTSn1>%qXEG1MUkt=i2XhrmA2Z; zeGf79Q|2Y=39Kx>d6bqf>t1>;?rrfIv?cOomi0w(D)8}EccPXVg7LOoCEx?Opa#zD z&CiCUbjF0K5&{7Y|=f8X4`8%$9>E5LlX>9+Xy!{=&W)NITz zvwOdB-rH@h>=~}xgc|BRKw>ZW{0)XoA>ndc>{SC|q; zu)qffM~K*`(VAs%4#2j_3j{EWKX(l!2s$p3HMu*a;Me0`)<(D2Z1VXu-talyXkW1p zvbxP_ijZ>o=yDJCjv|KJi~VqyYME{J+v9$Cd#d4V<{U+~+`gSPr{vLo#{v>T19;;L zV7{WzN}|mxlX1knvX{0i+U!QO;*rkk=bBWA3Xl_i`h?${Uo+AP=jY3QwQti-CDgv# zJ^lHiLWG*M)Zbh=tfjgVKS;T@qYYaLD)2D*5>x8zByQ5C>)y#e9CbK;QhW75!NiQ+ ztm>FZD7`h*An~4}{84Q7m{58ro#Xj+p<_^*7(-o{gv;2=kHi>=9RW3$w;qKSU@}sC z)z|;QlGtnXrS3dLAu=xF4Ya$J8t>*=9gjvN9;T3rv5cW1XMwr6^)jd}bLB$Md}x}v zlw>E!zT6?ExX-PF!1}{=3f4IP0{o0g_OB*($c%6fC8_&5!{(CE!s$skZ!oyCBu+9| z0L0?2Z?))hbU%P6(L?Y*@a=k`CdHROZMw_k^i-@nhhDp>ev)cDLJ9j7P$^cVx3ZTH z2{@Zt570^!C@e4oEvRNx)RWX`?rWH3`<>wqSx&$Vzt(}7e zn8b4S*Yc+D9c{U$s)h!}58HY_azm-{!bn3LKzW2LqoVQte`d%(XtCQTY$`QNL8oDu z>@_w9ow2$5t0`DhpDRo7ku|2yd!2oitlJCE!znjlC%~Aw%m=FmgB*McT&SNWQp0ksY+z64>=US^a z-U0-Ei#|;nqO43MKiHS)%J2qj9KijLqpNUe`un0A9RtP)M|aogmeH+rcb7;vGP=7) zN;gQSUnEC&hajnhgkS;64}YJ(;Jv-O^X`53+;arNI6rsxiqFjk_dlF{zUy^@QKZQd zNxdyN{><^^JNqV0kCfp6`YClA=0#@@IlMKz%YD)C@sIFiALj|)$P2+IEAt9C zLF~yN4ePzBE*@}H(EnqM&)cQ-&7`s~ub#xVUdd;yRO31rN4<_UUFgC$<;~SWg@OsU zKz(B4=PW{G6uxKeaTkYuTjzj<`?{41gWrvKn>$?is;fj;6T-yAzclK|J5P1yf)Xq$ z;+>~Ly2j&c0w=u^rs$&#RD+T~8W_0MahD2mFlHJ$7Zc}@6_Q09Tt^6q^bm`&t{QO2 zxWunlU`{d(a)LD}#BKCQUREXl#P}hMx%J9#g7+Ea*>8buTEt5aW~K&Xfra(uaWv5fH;U%!h-vp8Gh{M*;1m>yy8UsT3QxYcx#7n{wb!!DOK4rPSXbudmi zKAcD|pB`M(3BY`o2~W`>K?jOvJs{eT%h}@=I>37BlLG7zhf>gP#E`37DUfi9l!4~O z{#?75!n9L@9lJBRn$FMcPLzmouDo9BWK3 z^Lj1aWybn9??=SDF*kLFyT7Lr*(|RsEWCm`&C<&c9j9x8Y70a1TBX{A=Q7-HYi(Ah ze7S->+xTKxBg@$9%sjsTWNe;r`N$c%{PHJ>e{qiD2?^W?Sqwz8DoDtShJ8CW9r4^dNV?vbE5Z=GD zn1#KERP+y#q-lB@2^bq*Y`s5TnmFKYTtElc%Z+D013b$BjwipYGF@eEI33z=V0M11 z$iBIRL5fC`iiRPC;mZB0CSB-KXx~aUToke0?QG_#?^o;8jGh9Q*L+0fBH;1v@4MhPv{#Rot?U-} zO)f*Kzia9jiU_Bs_D&TJZpfW!&a*ueVsW|OGAC!ZTN354dL_MZ*zxZS_FCQk>wH*m zE)nSUgku3CtD+&~QM8j3X(Cu-Rz5$*cCjvvuqcw7hA12v4!28jG>GkJ`{l%TaZGvG z5fsC`8ye7v85KG55AU3!HglWuaT2noD*JP@8tHpa2Zj0ER3W{_k!&sr`~=z!#yZD_ zx(lCOg0l3n+ml6l?;+7P4egEkv*NhSUP!ORw1CEu!=yu-NGs+u2KpYDbTOT=g|mC3 zb~_7plDN~8A5p>6$X&*+Q^$`ihfVhw8qTEMOq`Ke5ie~61r}v=B1Kk3dUNc$RcYNs zdg+J#qPV+l=wp+@{gDxCXU;P9jY9&2^@5LP(wR~v=kzcc;J zzYK|@T_gzbf0156mlTI+$7%!(mpJ1p{4HUKXk-ttVu*X5Ml2eF!q+sO;M_2TouxIA zo&8q$t=M*CIc&Kvt42GvftGd7P9F}Oa;l!fK|i;z^=Hdaf9uC5K&w-DoyAto+tAZk z>rTcY`i61uV%73obCa}qe!iY~DtPqC$4{biX86gXzN2?UoO?&VQDkjPgh$?~)WNM% 
zCw4bvmNgzL4J?*=c6N7$H1XzH(YH?~{m6&*$ED@zEsKmU{7sZFx&76(tmiuuqw(_o zxco7F>&bc3O5<&#FV_MO_sXJ`1zo|Roz6mvKSb!+js$7<)iJr=XrH}BG%i0g@M~#8 zvg=AU%zlFTIY1+dL=LQ8p=Kn@~WlpnZx-RTJcY#VPYjYvPSSq6O)8wlJ|56|Ort>}REgCsinyfSCW$%P3dcnPy} zj4Y$x73S8yEcs{i4xiin9lwpvsuj^HMan8&AtiIR`x-0dlAN(DfzOm`X$WW5Jw>J>lJ<^_5*ZOqPZCq8N^o9X_B;|8>T;(56S(hOd0?@N zQyKRN_6&3e&Rmo#tiWl<&QnsJJ@XY0!k|W^4DL+^iR>kjNqG9ZAV((xaKvqfXLWJn z3j4-?zQK1D?qF-5XJ5|OVW&y#5+vN@5d%r z%XrB8oE*k|@xI+3=p3U9oA(}OYZ|K@zTX@t%!H)JRtY8)4^}7X8k2sG7CXB?>q_L7)HsQPoZBD7EX2L9{b%zr zB<8Q#UEDZ4bdIIV{#1Q?mkSHoShmFOR-D(YiwNW|YY!0=BbmwA6=~dMuDFaeE=&q? z!V1$s6K2 zB~KTTrEEMw;cHvFWsj?y?kg=n@}ab+W8`q3`_XAYU8tBl=i%Yj-;4u$ z6I!qAVRag1QL%!1x%&2+1#kY@ybn3=w&=D`+N_Z>@Eq&(|J9|2+A*QiQx!$Jk7N^)}+D|x(Ji0 zg+@F-be7KlwYQ$R`!MUM>~=eTzoR1O@JDrArxN)Iu{QJh&FRHr{m#2qL9s#IA_$0u z{#YPYkae`oTg@)LV4jPIOepKYuOnUIbZxD;jlGG07s8(#_=Q-W{O226`~qA5{299c zIP+Y*pJpyi-WIi1v+Dl^0k5}Y`P0!2{Jav^1u1y@%nEMqmc4G<{F^oL&9(D1<)3Ef zgWNpr%Qru6TawRycbJYWG6XGfI|8)aJ!4I0Q-elcbokEF$wWrj;Gc`Wyq8YW=bU7T z+)7o|!S7&Mx~W?lY$fUab0Am6Ml$3&Tkm%iK7Bu-kg>tWKALkSQ7le8 zWZ>OAop)wd)}{J%EyPgS{N@;v0*-1Y90KMXq^jE0xOhL zs`W_Y;41eklQun-hmp>UO$=clqcv7gaEFg=bUOw8njCop(YnQz@RvnP7!_nX6iYdkbyvpe6A{69V6|NkcJIYa*e4i4Tm&fWN5 GUjIMKg6h=( literal 0 HcmV?d00001 diff --git a/onnxruntime/python/tools/transformers/models/whisper/test/whisper_ort_output.txt b/onnxruntime/python/tools/transformers/models/whisper/test/whisper_ort_output.txt new file mode 100644 index 0000000000000..e3dbef248d0b2 --- /dev/null +++ b/onnxruntime/python/tools/transformers/models/whisper/test/whisper_ort_output.txt @@ -0,0 +1 @@ + the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml index 65866fc9827a5..43dedbc394c38 100644 --- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml @@ -314,12 +314,111 @@ stages: pushd /workspace/onnxruntime/python/tools/transformers/ ; \ python3 -m pip install --upgrade pip ; \ pushd models/llama ; \ - python3 -m pip install -r requirements-cuda.txt ; \ + python3 -m pip install -r requirements.txt ; \ popd ; \ python3 -m pip install /ort-artifact/*.whl ; \ + python3 -m pip uninstall -y torch ; \ python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider cuda --input /meta-llama2 --small_gpu ;\ popd ; \ " displayName: 'Run Llama2 to Onnx F16 and parity Test' workingDirectory: $(Build.SourcesDirectory) + +- stage: Whisper_ONNX + dependsOn: + - Build_Onnxruntime_Cuda + jobs: + - job: Whisper_ONNX + variables: + skipComponentGovernanceDetection: true + workspace: + clean: all + pool: Onnxruntime-Linux-A10-24G + steps: + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + + - checkout: self + clean: true + submodules: none + + - template: templates/flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download Onnxruntime Artifact' + ArtifactName: 'drop-ort-linux-gpu' + TargetPath: '$(Build.BinariesDirectory)/ort-artifact/' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} + + - template: 
templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu + Context: tools/ci_build/github/linux/docker/ + ScriptName: tools/ci_build/get_docker_image.py + DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" + Repository: onnxruntimepackagestest + UpdateDepsTxt: false + + - task: DownloadPackage@1 + # The model data in artifact is downloaded from openai/whisper-large-v3 in huggingface model hub + # In order to save size, removed .git directory and pickled files, and keep the safetensors model files + displayName: 'Download Whisper Model' + inputs: + packageType: upack + feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' + version: 1.0.0 + definition: 'b583ce7c-1a8f-4099-ae28-5d5f56c478b1' + downloadPath: $(Agent.TempDirectory)/whisper_large_v3 + + - script: | + docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \ + -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \ + -v $(Agent.TempDirectory)/whisper_large_v3:/whisper_large_v3 \ + onnxruntimepackagestest \ + bash -c ' + set -ex; \ + pushd /workspace/onnxruntime/python/tools/transformers/ ; \ + python3 -m pip install --upgrade pip ; \ + pushd models/whisper ; \ + python3 -m pip install -r requirements.txt ; \ + popd ; \ + python3 -m pip install /ort-artifact/*.whl ; \ + python3 -m pip uninstall -y torch ; \ + python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \ + python3 -m models.whisper.convert_to_onnx -m /whisper_large_v3 --output whisperlargev3 --use_external_data_format ; \ + popd ; \ + ' + displayName: 'Convert Whisper Model' + workingDirectory: $(Build.SourcesDirectory) + + - script: | + docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \ + -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \ + -v $(Agent.TempDirectory)/whisper_large_v3:/whisper_large_v3 \ + onnxruntimepackagestest \ + bash -c ' + set -ex; \ + pushd /workspace/onnxruntime/python/tools/transformers/ ; \ + python3 -m pip install --upgrade pip ; \ + pushd models/whisper ; \ + python3 -m pip install -r requirements.txt ; \ + popd ; \ + python3 -m pip install /ort-artifact/*.whl ; \ + python3 -m pip uninstall -y torch ; \ + python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \ + ls whisperlargev3; \ + python3 -m models.whisper.benchmark \ + --benchmark-type ort \ + --audio-path models/whisper/test/1272-141231-0002.mp3 \ + --model-name openai/whisper-large-v3 \ + --ort-model-path /workspace/onnxruntime/python/tools/transformers/whisperlargev3/whisper_large_v3_beamsearch.onnx \ + --precision fp32 \ + --device cuda > ort_output.txt ; \ + cat ort_output.txt ; \ + diff ort_output.txt /workspace/onnxruntime/python/tools/transformers/models/whisper/test/whisper_ort_output.txt && exit 0 || exit 1 + popd ; \ + ' + displayName: 'Test Whisper ONNX Model' + workingDirectory: $(Build.SourcesDirectory) diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu index 9b9dc9ecae822..c9038afc0954c 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu @@ -16,15 +16,18 @@ ENV DEBIAN_FRONTEND=noninteractive ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH_ARG}:${LD_LIBRARY_PATH} RUN apt-get update &&\ - apt-get install -y git bash wget + apt-get install -y git bash wget diffutils # Install python3 RUN apt-get install -y 
--no-install-recommends \ python3 \ python3-pip \ python3-dev \ - python3-wheel - + python3-wheel + +# Install ffmpeg, which couldn't be installed in UBI8 +# https://stackoverflow.com/questions/73597789/how-to-install-ffmpeg-on-ubi-docker-images +RUN apt-get install -y --no-install-recommends ffmpeg RUN pip install --upgrade pip From 430a086f22684ad0020819dc3e7712f36fe9f016 Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Sun, 25 Feb 2024 08:50:45 -0800 Subject: [PATCH 060/279] fix memory mapping on Windows (#19623) ### Description Windows memory map casts mapped_offset to DWORD directly. It will be truncated if it is larger than 2^32-1. We need to set high dwFileOffsetHigh for this case. ### Motivation and Context The bug was found from #19450 --- onnxruntime/core/platform/windows/env.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index 0eb34cbfbc9eb..983cc6089bb4c 100644 --- a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -459,8 +459,8 @@ Status WindowsEnv::MapFileIntoMemory(_In_z_ const ORTCHAR_T* file_path, void* const mapped_base = MapViewOfFile(file_mapping_handle.get(), FILE_MAP_READ, - 0, - static_cast(mapped_offset), + static_cast((mapped_offset >> 32) & 0xFFFFFFFF), + static_cast(mapped_offset & 0xFFFFFFFF), mapped_length); GSL_SUPPRESS(r.11) mapped_memory = From a9568935a52b3d51ec802a4ab89ab3852129fc1e Mon Sep 17 00:00:00 2001 From: Sumit Agarwal Date: Mon, 26 Feb 2024 11:35:13 -0800 Subject: [PATCH 061/279] [DML EP] Enable DML Graph Serialization (#19505) ### Description This PR adds a feature to serialize all DML EP partitions into DML currency individually for a given a model. This feature can be dynamically turned on by using DML EP option `ep.dml.enable_graph_serialization`. ### Motivation and Context - Why is this change required? What problem does it solve? Useful when user want to capture the DML EP specific partition into DML currency to mitigate the dependency on the framework. 
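
For context, a minimal usage sketch of how such a switch could be turned on from application code through the public C++ API. The config key string is the one named above, but treating it as a session config entry, using `"1"` as the enabling value, the model path, and device id `0` are assumptions for illustration rather than something taken from this patch.

```cpp
// Hedged sketch only: assumes ep.dml.enable_graph_serialization is consumed as a
// session config entry and that "1" enables it; device id 0 and the model path
// are placeholders.
#include <onnxruntime_cxx_api.h>
#include <dml_provider_factory.h>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "dml_serialize_demo");
  Ort::SessionOptions so;

  // Settings commonly recommended when registering the DML EP.
  so.DisableMemPattern();
  so.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);

  // Ask the DML EP to serialize each fused partition it compiles (assumed key/value).
  so.AddConfigEntry("ep.dml.enable_graph_serialization", "1");

  // The DML EP must be registered so there are DML partitions to serialize.
  Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_DML(so, /*device_id*/ 0));

  // Creating the session triggers partitioning and compilation of the DML subgraphs.
  Ort::Session session(env, ORT_TSTR("model.onnx"), so);
  return 0;
}
```
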
--- .../inc/IWinmlExecutionProvider.h | 7 +- .../DmlExecutionProvider/src/ApiTraits.cpp | 570 +++++++ .../src/DmlGraphDeserialization.cpp | 554 +++++++ .../src/DmlGraphFusionHelper.cpp | 247 ++- .../src/DmlGraphFusionHelper.h | 19 +- .../src/DmlGraphFusionTransformer.cpp | 41 +- .../src/DmlGraphFusionTransformer.h | 4 +- .../src/DmlGraphSerialization.cpp | 580 ++++++++ .../src/DmlRuntimeFusedGraphKernel.cpp | 30 +- .../src/External/DirectMLHelpers/ApiTraits.h | 453 +++++- .../External/DirectMLHelpers/DirectMLSchema.h | 112 +- .../DirectMLHelpers/DmlGraphDesc_generated.h | 788 ++++++++++ .../DirectMLHelpers/DmlGraphDeserialization.h | 14 + .../DirectMLHelpers/DmlGraphSerialization.h | 8 + .../DirectMLHelpers/DmlSerializedGraphDesc.h | 73 + .../DirectMLHelpers/GeneratedSchemaHelpers.h | 92 +- .../DirectMLHelpers/GeneratedSchemaTypes.h | 32 +- .../OperatorFieldTypes_generated.h | 1318 +++++++++++++++++ .../External/DirectMLHelpers/SchemaHelpers.h | 54 +- .../src/GraphDescBuilder.cpp | 404 ++--- .../src/GraphDescBuilder.h | 21 +- .../src/MLOperatorAuthorImpl.cpp | 30 +- .../src/Operators/DmlOperator.cpp | 4 +- .../src/Operators/DmlOperatorAttention.cpp | 2 +- .../src/Operators/DmlOperatorBiasAdd.cpp | 2 +- .../Operators/DmlOperatorBiasSplitGelu.cpp | 2 +- .../DmlOperatorEmbedLayerNormalization.cpp | 2 +- .../src/Operators/DmlOperatorGroupNorm.cpp | 2 +- .../DmlOperatorLayerNormalization.cpp | 2 +- .../Operators/DmlOperatorQLinearConcat.cpp | 2 +- .../Operators/DmlOperatorQLinearSigmoid.cpp | 2 +- .../src/Operators/DmlOperatorQuickGelu.cpp | 2 +- .../Operators/DmlOperatorRotaryEmbedding.cpp | 2 +- .../DmlOperatorSkipLayerNormalization.cpp | 2 +- .../dml/DmlExecutionProvider/src/Utility.h | 141 ++ .../dml/DmlExecutionProvider/src/precomp.h | 7 + .../MLOperatorAuthorPrivate.h | 11 +- .../dml/dml_session_options_config_keys.h | 1 + onnxruntime/core/session/inference_session.cc | 9 +- .../test/perftest/command_args_parser.cc | 1 + onnxruntime/test/perftest/ort_test_session.cc | 10 + 41 files changed, 5203 insertions(+), 454 deletions(-) create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/ApiTraits.cpp create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphDeserialization.cpp create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphSerialization.cpp create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDesc_generated.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDeserialization.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphSerialization.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlSerializedGraphDesc.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/OperatorFieldTypes_generated.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/Utility.h diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h index f29cc3afc3cda..88e3dd487d427 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h @@ -80,15 +80,10 @@ namespace Windows::AI::MachineLearning::Adapter }; // This is 
the counterpart to the MLOperatorGraphDesc ABI struct which owns its memory and uses containers. - // Either nodesAsOperatorDesc or nodesAsIDMLOperator can have non-zero size. struct DmlGraphNodeCreateInfo { uint32_t nodeCount = 0; - std::vector> nodesAsOperatorDesc; - - // TODO (jeffbloo): Remove this - std::vector> nodesAsIDMLOperator; - + std::vector> nodes; std::vector inputEdges; std::vector outputEdges; std::vector intermediateEdges; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ApiTraits.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ApiTraits.cpp new file mode 100644 index 0000000000000..bf9800458102b --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ApiTraits.cpp @@ -0,0 +1,570 @@ +//--------------------------------------------------------------------------- +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// This file is automatically generated. Please do not edit it directly. +// To modify this file, edit the schema: dml/Tools/DirectMLSchema.json +// And run this script to regenerate: dml/Tools/GenerateSchema.ps1 +// +// #dml-new-operator-location +//--------------------------------------------------------------------------- + +#pragma once + +#include "precomp.h" + +template +T ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ +#ifndef WAI_BUILD_LINUX + // Clang will instantiate this template even if it isn't used, + // so this static_assert will always fire and break the build. + static_assert(false, "Not implemented for this type"); +#endif +} + +template <> +DML_TENSOR_DATA_TYPE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_TENSOR_DATA_TYPE_UNKNOWN", DML_TENSOR_DATA_TYPE_UNKNOWN}, + {"DML_TENSOR_DATA_TYPE_FLOAT32", DML_TENSOR_DATA_TYPE_FLOAT32}, + {"DML_TENSOR_DATA_TYPE_FLOAT16", DML_TENSOR_DATA_TYPE_FLOAT16}, + {"DML_TENSOR_DATA_TYPE_UINT32", DML_TENSOR_DATA_TYPE_UINT32}, + {"DML_TENSOR_DATA_TYPE_UINT16", DML_TENSOR_DATA_TYPE_UINT16}, + {"DML_TENSOR_DATA_TYPE_UINT8", DML_TENSOR_DATA_TYPE_UINT8}, + {"DML_TENSOR_DATA_TYPE_INT32", DML_TENSOR_DATA_TYPE_INT32}, + {"DML_TENSOR_DATA_TYPE_INT16", DML_TENSOR_DATA_TYPE_INT16}, + {"DML_TENSOR_DATA_TYPE_INT8", DML_TENSOR_DATA_TYPE_INT8}, + {"DML_TENSOR_DATA_TYPE_FLOAT64", DML_TENSOR_DATA_TYPE_FLOAT64}, + {"DML_TENSOR_DATA_TYPE_UINT64", DML_TENSOR_DATA_TYPE_UINT64}, + {"DML_TENSOR_DATA_TYPE_INT64", DML_TENSOR_DATA_TYPE_INT64}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_TENSOR_TYPE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_TENSOR_TYPE_INVALID", DML_TENSOR_TYPE_INVALID}, + {"DML_TENSOR_TYPE_BUFFER", DML_TENSOR_TYPE_BUFFER}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_OPERATOR_TYPE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_OPERATOR_INVALID", DML_OPERATOR_INVALID}, + {"DML_OPERATOR_ELEMENT_WISE_IDENTITY", DML_OPERATOR_ELEMENT_WISE_IDENTITY}, + {"DML_OPERATOR_ELEMENT_WISE_ABS", DML_OPERATOR_ELEMENT_WISE_ABS}, + {"DML_OPERATOR_ELEMENT_WISE_ACOS", DML_OPERATOR_ELEMENT_WISE_ACOS}, + {"DML_OPERATOR_ELEMENT_WISE_ADD", 
DML_OPERATOR_ELEMENT_WISE_ADD}, + {"DML_OPERATOR_ELEMENT_WISE_ASIN", DML_OPERATOR_ELEMENT_WISE_ASIN}, + {"DML_OPERATOR_ELEMENT_WISE_ATAN", DML_OPERATOR_ELEMENT_WISE_ATAN}, + {"DML_OPERATOR_ELEMENT_WISE_CEIL", DML_OPERATOR_ELEMENT_WISE_CEIL}, + {"DML_OPERATOR_ELEMENT_WISE_CLIP", DML_OPERATOR_ELEMENT_WISE_CLIP}, + {"DML_OPERATOR_ELEMENT_WISE_COS", DML_OPERATOR_ELEMENT_WISE_COS}, + {"DML_OPERATOR_ELEMENT_WISE_DIVIDE", DML_OPERATOR_ELEMENT_WISE_DIVIDE}, + {"DML_OPERATOR_ELEMENT_WISE_EXP", DML_OPERATOR_ELEMENT_WISE_EXP}, + {"DML_OPERATOR_ELEMENT_WISE_FLOOR", DML_OPERATOR_ELEMENT_WISE_FLOOR}, + {"DML_OPERATOR_ELEMENT_WISE_LOG", DML_OPERATOR_ELEMENT_WISE_LOG}, + {"DML_OPERATOR_ELEMENT_WISE_LOGICAL_AND", DML_OPERATOR_ELEMENT_WISE_LOGICAL_AND}, + {"DML_OPERATOR_ELEMENT_WISE_LOGICAL_EQUALS", DML_OPERATOR_ELEMENT_WISE_LOGICAL_EQUALS}, + {"DML_OPERATOR_ELEMENT_WISE_LOGICAL_GREATER_THAN", DML_OPERATOR_ELEMENT_WISE_LOGICAL_GREATER_THAN}, + {"DML_OPERATOR_ELEMENT_WISE_LOGICAL_LESS_THAN", DML_OPERATOR_ELEMENT_WISE_LOGICAL_LESS_THAN}, + {"DML_OPERATOR_ELEMENT_WISE_LOGICAL_GREATER_THAN_OR_EQUAL", DML_OPERATOR_ELEMENT_WISE_LOGICAL_GREATER_THAN_OR_EQUAL}, + {"DML_OPERATOR_ELEMENT_WISE_LOGICAL_LESS_THAN_OR_EQUAL", DML_OPERATOR_ELEMENT_WISE_LOGICAL_LESS_THAN_OR_EQUAL}, + {"DML_OPERATOR_ELEMENT_WISE_LOGICAL_NOT", DML_OPERATOR_ELEMENT_WISE_LOGICAL_NOT}, + {"DML_OPERATOR_ELEMENT_WISE_LOGICAL_OR", DML_OPERATOR_ELEMENT_WISE_LOGICAL_OR}, + {"DML_OPERATOR_ELEMENT_WISE_LOGICAL_XOR", DML_OPERATOR_ELEMENT_WISE_LOGICAL_XOR}, + {"DML_OPERATOR_ELEMENT_WISE_MAX", DML_OPERATOR_ELEMENT_WISE_MAX}, + {"DML_OPERATOR_ELEMENT_WISE_MEAN", DML_OPERATOR_ELEMENT_WISE_MEAN}, + {"DML_OPERATOR_ELEMENT_WISE_MIN", DML_OPERATOR_ELEMENT_WISE_MIN}, + {"DML_OPERATOR_ELEMENT_WISE_MULTIPLY", DML_OPERATOR_ELEMENT_WISE_MULTIPLY}, + {"DML_OPERATOR_ELEMENT_WISE_POW", DML_OPERATOR_ELEMENT_WISE_POW}, + {"DML_OPERATOR_ELEMENT_WISE_CONSTANT_POW", DML_OPERATOR_ELEMENT_WISE_CONSTANT_POW}, + {"DML_OPERATOR_ELEMENT_WISE_RECIP", DML_OPERATOR_ELEMENT_WISE_RECIP}, + {"DML_OPERATOR_ELEMENT_WISE_SIN", DML_OPERATOR_ELEMENT_WISE_SIN}, + {"DML_OPERATOR_ELEMENT_WISE_SQRT", DML_OPERATOR_ELEMENT_WISE_SQRT}, + {"DML_OPERATOR_ELEMENT_WISE_SUBTRACT", DML_OPERATOR_ELEMENT_WISE_SUBTRACT}, + {"DML_OPERATOR_ELEMENT_WISE_TAN", DML_OPERATOR_ELEMENT_WISE_TAN}, + {"DML_OPERATOR_ELEMENT_WISE_THRESHOLD", DML_OPERATOR_ELEMENT_WISE_THRESHOLD}, + {"DML_OPERATOR_ELEMENT_WISE_QUANTIZE_LINEAR", DML_OPERATOR_ELEMENT_WISE_QUANTIZE_LINEAR}, + {"DML_OPERATOR_ELEMENT_WISE_DEQUANTIZE_LINEAR", DML_OPERATOR_ELEMENT_WISE_DEQUANTIZE_LINEAR}, + {"DML_OPERATOR_ACTIVATION_ELU", DML_OPERATOR_ACTIVATION_ELU}, + {"DML_OPERATOR_ACTIVATION_CELU", DML_OPERATOR_ACTIVATION_CELU}, + {"DML_OPERATOR_ACTIVATION_HARDMAX", DML_OPERATOR_ACTIVATION_HARDMAX}, + {"DML_OPERATOR_ACTIVATION_HARDMAX1", DML_OPERATOR_ACTIVATION_HARDMAX1}, + {"DML_OPERATOR_ACTIVATION_HARD_SIGMOID", DML_OPERATOR_ACTIVATION_HARD_SIGMOID}, + {"DML_OPERATOR_ACTIVATION_IDENTITY", DML_OPERATOR_ACTIVATION_IDENTITY}, + {"DML_OPERATOR_ACTIVATION_LEAKY_RELU", DML_OPERATOR_ACTIVATION_LEAKY_RELU}, + {"DML_OPERATOR_ACTIVATION_LINEAR", DML_OPERATOR_ACTIVATION_LINEAR}, + {"DML_OPERATOR_ACTIVATION_LOG_SOFTMAX", DML_OPERATOR_ACTIVATION_LOG_SOFTMAX}, + {"DML_OPERATOR_ACTIVATION_LOG_SOFTMAX1", DML_OPERATOR_ACTIVATION_LOG_SOFTMAX1}, + {"DML_OPERATOR_ACTIVATION_PARAMETERIZED_RELU", DML_OPERATOR_ACTIVATION_PARAMETERIZED_RELU}, + {"DML_OPERATOR_ACTIVATION_PARAMETRIC_SOFTPLUS", DML_OPERATOR_ACTIVATION_PARAMETRIC_SOFTPLUS}, + {"DML_OPERATOR_ACTIVATION_RELU", 
DML_OPERATOR_ACTIVATION_RELU}, + {"DML_OPERATOR_ACTIVATION_SCALED_ELU", DML_OPERATOR_ACTIVATION_SCALED_ELU}, + {"DML_OPERATOR_ACTIVATION_SCALED_TANH", DML_OPERATOR_ACTIVATION_SCALED_TANH}, + {"DML_OPERATOR_ACTIVATION_SIGMOID", DML_OPERATOR_ACTIVATION_SIGMOID}, + {"DML_OPERATOR_ACTIVATION_SOFTMAX", DML_OPERATOR_ACTIVATION_SOFTMAX}, + {"DML_OPERATOR_ACTIVATION_SOFTMAX1", DML_OPERATOR_ACTIVATION_SOFTMAX1}, + {"DML_OPERATOR_ACTIVATION_SOFTPLUS", DML_OPERATOR_ACTIVATION_SOFTPLUS}, + {"DML_OPERATOR_ACTIVATION_SOFTSIGN", DML_OPERATOR_ACTIVATION_SOFTSIGN}, + {"DML_OPERATOR_ACTIVATION_TANH", DML_OPERATOR_ACTIVATION_TANH}, + {"DML_OPERATOR_ACTIVATION_THRESHOLDED_RELU", DML_OPERATOR_ACTIVATION_THRESHOLDED_RELU}, + {"DML_OPERATOR_CONVOLUTION", DML_OPERATOR_CONVOLUTION}, + {"DML_OPERATOR_GEMM", DML_OPERATOR_GEMM}, + {"DML_OPERATOR_REDUCE", DML_OPERATOR_REDUCE}, + {"DML_OPERATOR_AVERAGE_POOLING", DML_OPERATOR_AVERAGE_POOLING}, + {"DML_OPERATOR_AVERAGE_POOLING1", DML_OPERATOR_AVERAGE_POOLING1}, + {"DML_OPERATOR_LP_POOLING", DML_OPERATOR_LP_POOLING}, + {"DML_OPERATOR_LP_POOLING1", DML_OPERATOR_LP_POOLING1}, + {"DML_OPERATOR_MAX_POOLING", DML_OPERATOR_MAX_POOLING}, + {"DML_OPERATOR_ROI_POOLING", DML_OPERATOR_ROI_POOLING}, + {"DML_OPERATOR_SLICE", DML_OPERATOR_SLICE}, + {"DML_OPERATOR_CAST", DML_OPERATOR_CAST}, + {"DML_OPERATOR_SPLIT", DML_OPERATOR_SPLIT}, + {"DML_OPERATOR_JOIN", DML_OPERATOR_JOIN}, + {"DML_OPERATOR_PADDING", DML_OPERATOR_PADDING}, + {"DML_OPERATOR_PADDING1", DML_OPERATOR_PADDING1}, + {"DML_OPERATOR_VALUE_SCALE_2D", DML_OPERATOR_VALUE_SCALE_2D}, + {"DML_OPERATOR_UPSAMPLE_2D", DML_OPERATOR_UPSAMPLE_2D}, + {"DML_OPERATOR_GATHER", DML_OPERATOR_GATHER}, + {"DML_OPERATOR_SPACE_TO_DEPTH", DML_OPERATOR_SPACE_TO_DEPTH}, + {"DML_OPERATOR_DEPTH_TO_SPACE", DML_OPERATOR_DEPTH_TO_SPACE}, + {"DML_OPERATOR_TILE", DML_OPERATOR_TILE}, + {"DML_OPERATOR_TOP_K", DML_OPERATOR_TOP_K}, + {"DML_OPERATOR_BATCH_NORMALIZATION", DML_OPERATOR_BATCH_NORMALIZATION}, + {"DML_OPERATOR_BATCH_NORMALIZATION_TRAINING", DML_OPERATOR_BATCH_NORMALIZATION_TRAINING}, + {"DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION", DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION}, + {"DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION", DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION}, + {"DML_OPERATOR_LP_NORMALIZATION", DML_OPERATOR_LP_NORMALIZATION}, + {"DML_OPERATOR_RNN", DML_OPERATOR_RNN}, + {"DML_OPERATOR_LSTM", DML_OPERATOR_LSTM}, + {"DML_OPERATOR_GRU", DML_OPERATOR_GRU}, + {"DML_OPERATOR_ELEMENT_WISE_SIGN", DML_OPERATOR_ELEMENT_WISE_SIGN}, + {"DML_OPERATOR_ELEMENT_WISE_IS_NAN", DML_OPERATOR_ELEMENT_WISE_IS_NAN}, + {"DML_OPERATOR_ELEMENT_WISE_ERF", DML_OPERATOR_ELEMENT_WISE_ERF}, + {"DML_OPERATOR_ELEMENT_WISE_SINH", DML_OPERATOR_ELEMENT_WISE_SINH}, + {"DML_OPERATOR_ELEMENT_WISE_COSH", DML_OPERATOR_ELEMENT_WISE_COSH}, + {"DML_OPERATOR_ELEMENT_WISE_TANH", DML_OPERATOR_ELEMENT_WISE_TANH}, + {"DML_OPERATOR_ELEMENT_WISE_ASINH", DML_OPERATOR_ELEMENT_WISE_ASINH}, + {"DML_OPERATOR_ELEMENT_WISE_ACOSH", DML_OPERATOR_ELEMENT_WISE_ACOSH}, + {"DML_OPERATOR_ELEMENT_WISE_ATANH", DML_OPERATOR_ELEMENT_WISE_ATANH}, + {"DML_OPERATOR_ELEMENT_WISE_IF", DML_OPERATOR_ELEMENT_WISE_IF}, + {"DML_OPERATOR_ELEMENT_WISE_ADD1", DML_OPERATOR_ELEMENT_WISE_ADD1}, + {"DML_OPERATOR_ACTIVATION_SHRINK", DML_OPERATOR_ACTIVATION_SHRINK}, + {"DML_OPERATOR_MAX_POOLING1", DML_OPERATOR_MAX_POOLING1}, + {"DML_OPERATOR_MAX_UNPOOLING", DML_OPERATOR_MAX_UNPOOLING}, + {"DML_OPERATOR_DIAGONAL_MATRIX", DML_OPERATOR_DIAGONAL_MATRIX}, + {"DML_OPERATOR_SCATTER", DML_OPERATOR_SCATTER}, + {"DML_OPERATOR_ONE_HOT", 
DML_OPERATOR_ONE_HOT}, + {"DML_OPERATOR_RESAMPLE", DML_OPERATOR_RESAMPLE}, + {"DML_OPERATOR_ELEMENT_WISE_BIT_SHIFT_LEFT", DML_OPERATOR_ELEMENT_WISE_BIT_SHIFT_LEFT}, + {"DML_OPERATOR_ELEMENT_WISE_BIT_SHIFT_RIGHT", DML_OPERATOR_ELEMENT_WISE_BIT_SHIFT_RIGHT}, + {"DML_OPERATOR_ELEMENT_WISE_ROUND", DML_OPERATOR_ELEMENT_WISE_ROUND}, + {"DML_OPERATOR_ELEMENT_WISE_IS_INFINITY", DML_OPERATOR_ELEMENT_WISE_IS_INFINITY}, + {"DML_OPERATOR_ELEMENT_WISE_MODULUS_TRUNCATE", DML_OPERATOR_ELEMENT_WISE_MODULUS_TRUNCATE}, + {"DML_OPERATOR_ELEMENT_WISE_MODULUS_FLOOR", DML_OPERATOR_ELEMENT_WISE_MODULUS_FLOOR}, + {"DML_OPERATOR_FILL_VALUE_SEQUENCE", DML_OPERATOR_FILL_VALUE_SEQUENCE}, + {"DML_OPERATOR_FILL_VALUE_CONSTANT", DML_OPERATOR_FILL_VALUE_CONSTANT}, + {"DML_OPERATOR_CUMULATIVE_SUMMATION", DML_OPERATOR_CUMULATIVE_SUMMATION}, + {"DML_OPERATOR_REVERSE_SUBSEQUENCES", DML_OPERATOR_REVERSE_SUBSEQUENCES}, + {"DML_OPERATOR_GATHER_ELEMENTS", DML_OPERATOR_GATHER_ELEMENTS}, + {"DML_OPERATOR_GATHER_ND", DML_OPERATOR_GATHER_ND}, + {"DML_OPERATOR_SCATTER_ND", DML_OPERATOR_SCATTER_ND}, + {"DML_OPERATOR_MAX_POOLING2", DML_OPERATOR_MAX_POOLING2}, + {"DML_OPERATOR_SLICE1", DML_OPERATOR_SLICE1}, + {"DML_OPERATOR_TOP_K1", DML_OPERATOR_TOP_K1}, + {"DML_OPERATOR_DEPTH_TO_SPACE1", DML_OPERATOR_DEPTH_TO_SPACE1}, + {"DML_OPERATOR_SPACE_TO_DEPTH1", DML_OPERATOR_SPACE_TO_DEPTH1}, + {"DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION1", DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION1}, + {"DML_OPERATOR_RESAMPLE1", DML_OPERATOR_RESAMPLE1}, + {"DML_OPERATOR_MATRIX_MULTIPLY_INTEGER", DML_OPERATOR_MATRIX_MULTIPLY_INTEGER}, + {"DML_OPERATOR_QUANTIZED_LINEAR_MATRIX_MULTIPLY", DML_OPERATOR_QUANTIZED_LINEAR_MATRIX_MULTIPLY}, + {"DML_OPERATOR_CONVOLUTION_INTEGER", DML_OPERATOR_CONVOLUTION_INTEGER}, + {"DML_OPERATOR_QUANTIZED_LINEAR_CONVOLUTION", DML_OPERATOR_QUANTIZED_LINEAR_CONVOLUTION}, + {"DML_OPERATOR_ELEMENT_WISE_BIT_AND", DML_OPERATOR_ELEMENT_WISE_BIT_AND}, + {"DML_OPERATOR_ELEMENT_WISE_BIT_OR", DML_OPERATOR_ELEMENT_WISE_BIT_OR}, + {"DML_OPERATOR_ELEMENT_WISE_BIT_XOR", DML_OPERATOR_ELEMENT_WISE_BIT_XOR}, + {"DML_OPERATOR_ELEMENT_WISE_BIT_NOT", DML_OPERATOR_ELEMENT_WISE_BIT_NOT}, + {"DML_OPERATOR_ELEMENT_WISE_BIT_COUNT", DML_OPERATOR_ELEMENT_WISE_BIT_COUNT}, + {"DML_OPERATOR_ACTIVATION_RELU_GRAD", DML_OPERATOR_ACTIVATION_RELU_GRAD}, + {"DML_OPERATOR_AVERAGE_POOLING_GRAD", DML_OPERATOR_AVERAGE_POOLING_GRAD}, + {"DML_OPERATOR_MAX_POOLING_GRAD", DML_OPERATOR_MAX_POOLING_GRAD}, + {"DML_OPERATOR_RANDOM_GENERATOR", DML_OPERATOR_RANDOM_GENERATOR}, + {"DML_OPERATOR_NONZERO_COORDINATES", DML_OPERATOR_NONZERO_COORDINATES}, + {"DML_OPERATOR_RESAMPLE_GRAD", DML_OPERATOR_RESAMPLE_GRAD}, + {"DML_OPERATOR_SLICE_GRAD", DML_OPERATOR_SLICE_GRAD}, + {"DML_OPERATOR_ADAM_OPTIMIZER", DML_OPERATOR_ADAM_OPTIMIZER}, + {"DML_OPERATOR_ARGMIN", DML_OPERATOR_ARGMIN}, + {"DML_OPERATOR_ARGMAX", DML_OPERATOR_ARGMAX}, + {"DML_OPERATOR_ROI_ALIGN", DML_OPERATOR_ROI_ALIGN}, + {"DML_OPERATOR_GATHER_ND1", DML_OPERATOR_GATHER_ND1}, + {"DML_OPERATOR_ELEMENT_WISE_ATAN_YX", DML_OPERATOR_ELEMENT_WISE_ATAN_YX}, + {"DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD", DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD}, + {"DML_OPERATOR_ELEMENT_WISE_DIFFERENCE_SQUARE", DML_OPERATOR_ELEMENT_WISE_DIFFERENCE_SQUARE}, + {"DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION_GRAD", DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION_GRAD}, + {"DML_OPERATOR_CUMULATIVE_PRODUCT", DML_OPERATOR_CUMULATIVE_PRODUCT}, + {"DML_OPERATOR_BATCH_NORMALIZATION_GRAD", DML_OPERATOR_BATCH_NORMALIZATION_GRAD}, + {"DML_OPERATOR_BATCH_NORMALIZATION_TRAINING_GRAD", 
DML_OPERATOR_BATCH_NORMALIZATION_TRAINING_GRAD}, + {"DML_OPERATOR_ELEMENT_WISE_QUANTIZED_LINEAR_ADD", DML_OPERATOR_ELEMENT_WISE_QUANTIZED_LINEAR_ADD}, + {"DML_OPERATOR_DYNAMIC_QUANTIZE_LINEAR", DML_OPERATOR_DYNAMIC_QUANTIZE_LINEAR}, + {"DML_OPERATOR_ROI_ALIGN1", DML_OPERATOR_ROI_ALIGN1}, + {"DML_OPERATOR_ELEMENT_WISE_CLIP1", DML_OPERATOR_ELEMENT_WISE_CLIP1}, + {"DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD1", DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD1}, + {"DML_OPERATOR_ELEMENT_WISE_NEGATE", DML_OPERATOR_ELEMENT_WISE_NEGATE}, + {"DML_OPERATOR_ACTIVATION_GELU", DML_OPERATOR_ACTIVATION_GELU}, + {"DML_OPERATOR_ACTIVATION_SWISH", DML_OPERATOR_ACTIVATION_SWISH}, + {"DML_OPERATOR_ACTIVATION_HARD_SWISH", DML_OPERATOR_ACTIVATION_HARD_SWISH}, + {"DML_OPERATOR_RESAMPLE2", DML_OPERATOR_RESAMPLE2}, + {"DML_OPERATOR_RESAMPLE_GRAD1", DML_OPERATOR_RESAMPLE_GRAD1}, + {"DML_OPERATOR_DIAGONAL_MATRIX1", DML_OPERATOR_DIAGONAL_MATRIX1}, + {"DML_OPERATOR_MULTIHEAD_ATTENTION", DML_OPERATOR_MULTIHEAD_ATTENTION}, + {"DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING", DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING}, + {"DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT", DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_BINDING_TYPE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_BINDING_TYPE_NONE", DML_BINDING_TYPE_NONE}, + {"DML_BINDING_TYPE_BUFFER", DML_BINDING_TYPE_BUFFER}, + {"DML_BINDING_TYPE_BUFFER_ARRAY", DML_BINDING_TYPE_BUFFER_ARRAY}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_REDUCE_FUNCTION ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_REDUCE_FUNCTION_ARGMAX", DML_REDUCE_FUNCTION_ARGMAX}, + {"DML_REDUCE_FUNCTION_ARGMIN", DML_REDUCE_FUNCTION_ARGMIN}, + {"DML_REDUCE_FUNCTION_AVERAGE", DML_REDUCE_FUNCTION_AVERAGE}, + {"DML_REDUCE_FUNCTION_L1", DML_REDUCE_FUNCTION_L1}, + {"DML_REDUCE_FUNCTION_L2", DML_REDUCE_FUNCTION_L2}, + {"DML_REDUCE_FUNCTION_LOG_SUM", DML_REDUCE_FUNCTION_LOG_SUM}, + {"DML_REDUCE_FUNCTION_LOG_SUM_EXP", DML_REDUCE_FUNCTION_LOG_SUM_EXP}, + {"DML_REDUCE_FUNCTION_MAX", DML_REDUCE_FUNCTION_MAX}, + {"DML_REDUCE_FUNCTION_MIN", DML_REDUCE_FUNCTION_MIN}, + {"DML_REDUCE_FUNCTION_MULTIPLY", DML_REDUCE_FUNCTION_MULTIPLY}, + {"DML_REDUCE_FUNCTION_SUM", DML_REDUCE_FUNCTION_SUM}, + {"DML_REDUCE_FUNCTION_SUM_SQUARE", DML_REDUCE_FUNCTION_SUM_SQUARE}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + +template <> +DML_MATRIX_TRANSFORM ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_MATRIX_TRANSFORM_NONE", DML_MATRIX_TRANSFORM_NONE}, + {"DML_MATRIX_TRANSFORM_TRANSPOSE", DML_MATRIX_TRANSFORM_TRANSPOSE}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_CONVOLUTION_MODE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_CONVOLUTION_MODE_CONVOLUTION", 
DML_CONVOLUTION_MODE_CONVOLUTION}, + {"DML_CONVOLUTION_MODE_CROSS_CORRELATION", DML_CONVOLUTION_MODE_CROSS_CORRELATION}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_CONVOLUTION_DIRECTION ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_CONVOLUTION_DIRECTION_FORWARD", DML_CONVOLUTION_DIRECTION_FORWARD}, + {"DML_CONVOLUTION_DIRECTION_BACKWARD", DML_CONVOLUTION_DIRECTION_BACKWARD}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + +template <> +DML_PADDING_MODE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_PADDING_MODE_CONSTANT", DML_PADDING_MODE_CONSTANT}, + {"DML_PADDING_MODE_EDGE", DML_PADDING_MODE_EDGE}, + {"DML_PADDING_MODE_REFLECTION", DML_PADDING_MODE_REFLECTION}, + {"DML_PADDING_MODE_SYMMETRIC", DML_PADDING_MODE_SYMMETRIC}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_INTERPOLATION_MODE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_INTERPOLATION_MODE_NEAREST_NEIGHBOR", DML_INTERPOLATION_MODE_NEAREST_NEIGHBOR}, + {"DML_INTERPOLATION_MODE_LINEAR", DML_INTERPOLATION_MODE_LINEAR}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_RECURRENT_NETWORK_DIRECTION ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_RECURRENT_NETWORK_DIRECTION_FORWARD", DML_RECURRENT_NETWORK_DIRECTION_FORWARD}, + {"DML_RECURRENT_NETWORK_DIRECTION_BACKWARD", DML_RECURRENT_NETWORK_DIRECTION_BACKWARD}, + {"DML_RECURRENT_NETWORK_DIRECTION_BIDIRECTIONAL", DML_RECURRENT_NETWORK_DIRECTION_BIDIRECTIONAL}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_FEATURE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_FEATURE_TENSOR_DATA_TYPE_SUPPORT", DML_FEATURE_TENSOR_DATA_TYPE_SUPPORT}, + {"DML_FEATURE_FEATURE_LEVELS", DML_FEATURE_FEATURE_LEVELS}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_FEATURE_LEVEL ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_FEATURE_LEVEL_1_0", DML_FEATURE_LEVEL_1_0}, + {"DML_FEATURE_LEVEL_2_0", DML_FEATURE_LEVEL_2_0}, + {"DML_FEATURE_LEVEL_2_1", DML_FEATURE_LEVEL_2_1}, + {"DML_FEATURE_LEVEL_3_0", DML_FEATURE_LEVEL_3_0}, + {"DML_FEATURE_LEVEL_3_1", DML_FEATURE_LEVEL_3_1}, + {"DML_FEATURE_LEVEL_4_0", DML_FEATURE_LEVEL_4_0}, + {"DML_FEATURE_LEVEL_4_1", DML_FEATURE_LEVEL_4_1}, + {"DML_FEATURE_LEVEL_5_0", DML_FEATURE_LEVEL_5_0}, + {"DML_FEATURE_LEVEL_5_1", DML_FEATURE_LEVEL_5_1}, + {"DML_FEATURE_LEVEL_5_2", DML_FEATURE_LEVEL_5_2}, + {"DML_FEATURE_LEVEL_6_0", 
DML_FEATURE_LEVEL_6_0}, + {"DML_FEATURE_LEVEL_6_1", DML_FEATURE_LEVEL_6_1}, + {"DML_FEATURE_LEVEL_6_2", DML_FEATURE_LEVEL_6_2}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_IS_INFINITY_MODE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_IS_INFINITY_MODE_EITHER", DML_IS_INFINITY_MODE_EITHER}, + {"DML_IS_INFINITY_MODE_POSITIVE", DML_IS_INFINITY_MODE_POSITIVE}, + {"DML_IS_INFINITY_MODE_NEGATIVE", DML_IS_INFINITY_MODE_NEGATIVE}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_DEPTH_SPACE_ORDER ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_DEPTH_SPACE_ORDER_DEPTH_COLUMN_ROW", DML_DEPTH_SPACE_ORDER_DEPTH_COLUMN_ROW}, + {"DML_DEPTH_SPACE_ORDER_COLUMN_ROW_DEPTH", DML_DEPTH_SPACE_ORDER_COLUMN_ROW_DEPTH}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_AXIS_DIRECTION ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_AXIS_DIRECTION_INCREASING", DML_AXIS_DIRECTION_INCREASING}, + {"DML_AXIS_DIRECTION_DECREASING", DML_AXIS_DIRECTION_DECREASING}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_ROUNDING_MODE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_ROUNDING_MODE_HALVES_TO_NEAREST_EVEN", DML_ROUNDING_MODE_HALVES_TO_NEAREST_EVEN}, + {"DML_ROUNDING_MODE_TOWARD_ZERO", DML_ROUNDING_MODE_TOWARD_ZERO}, + {"DML_ROUNDING_MODE_TOWARD_INFINITY", DML_ROUNDING_MODE_TOWARD_INFINITY}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_RANDOM_GENERATOR_TYPE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_RANDOM_GENERATOR_TYPE_PHILOX_4X32_10", DML_RANDOM_GENERATOR_TYPE_PHILOX_4X32_10}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_MULTIHEAD_ATTENTION_MASK_TYPE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_MULTIHEAD_ATTENTION_MASK_TYPE_NONE", DML_MULTIHEAD_ATTENTION_MASK_TYPE_NONE}, + {"DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_LENGTH", DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_LENGTH}, + {"DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_END_START", DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_END_START}, + {"DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_QUERY_SEQUENCE_LENGTH_START_END", DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_QUERY_SEQUENCE_LENGTH_START_END}, + {"DML_MULTIHEAD_ATTENTION_MASK_TYPE_BOOLEAN", DML_MULTIHEAD_ATTENTION_MASK_TYPE_BOOLEAN}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); 
+ } + return static_cast(*index); +} + diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphDeserialization.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphDeserialization.cpp new file mode 100644 index 0000000000000..7d8ed17e7d925 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphDeserialization.cpp @@ -0,0 +1,554 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. + +#pragma once +#include "precomp.h" + +OperatorFieldVariant CreateAttribute( + const DML_SCHEMA_FIELD* schemaField, + const dml::ir::operatorFieldTypes::AttributeDesc* attributeDesc); + +OperatorFieldVariant CreateActivation( + const dml::ir::operatorFieldTypes::Activation* activationDesc) +{ + DML_OPERATOR_TYPE activationOperatorType = ApiTraits::StringifyHelpers::FromString(activationDesc->type()->c_str()); + const DML_OPERATOR_SCHEMA& activationSchema = SchemaHelpers::GetSchema(activationOperatorType); + std::vector activationOperatorFields(activationSchema.FieldCount); + uint32_t attributeIndex = 0; + + for (uint32_t fieldIndex = 0; fieldIndex < activationSchema.FieldCount; fieldIndex++) + { + const DML_SCHEMA_FIELD* schemaField = &activationSchema.Fields[fieldIndex]; + OperatorFieldVariant field; + switch (schemaField->Kind) + { + case DML_SCHEMA_FIELD_KIND_INPUT_TENSOR: + case DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR: + { + if (schemaField->Type == DML_SCHEMA_FIELD_TYPE_TENSOR_DESC) + { + field = OperatorFieldTypes::TensorDesc(); + } + else if (schemaField->Type == DML_SCHEMA_FIELD_TYPE_TENSOR_DESC_ARRAY) + { + field = OperatorFieldTypes::TensorDescArray(); + } + break; + } + case DML_SCHEMA_FIELD_KIND_ATTRIBUTE: + { + const dml::ir::operatorFieldTypes::AttributeDesc* attributeDesc = + attributeIndex >= activationDesc->attributes()->size() ? + nullptr : + activationDesc->attributes()->Get(attributeIndex++); + field = CreateAttribute(schemaField, attributeDesc); + break; + } + } + + activationOperatorFields[fieldIndex] = OperatorField(schemaField, std::move(field)); + } + + return AbstractOperatorDesc(&activationSchema, std::move(activationOperatorFields)); +} + +OperatorFieldVariant CreateActivations( + const dml::ir::operatorFieldTypes::ActivationArray* activationDescs) +{ + std::vector activations; + for (uint32_t index = 0; index < static_cast(activationDescs->data()->size()); index++) + { + OperatorFieldVariant activation = CreateActivation(activationDescs->data()->Get(index)); + activations.push_back(std::get(activation).value()); + } + return activations; +} + +OperatorFieldVariant CreateAttribute( + const DML_SCHEMA_FIELD* schemaField, + const dml::ir::operatorFieldTypes::AttributeDesc* attributeDesc) +{ + switch (schemaField->Type) + { + case DML_SCHEMA_FIELD_TYPE_OPERATOR_DESC: + { + return attributeDesc != nullptr && attributeDesc->val_as_Activation() != nullptr ? + CreateActivation(attributeDesc->val_as_Activation()) : + OperatorFieldTypes::FusedActivationOperatorDesc(); + } + case DML_SCHEMA_FIELD_TYPE_OPERATOR_DESC_ARRAY: + { + return attributeDesc != nullptr && attributeDesc->val_as_ActivationArray() != nullptr ? 
+ CreateActivations(attributeDesc->val_as_ActivationArray()) : + OperatorFieldTypes::FusedActivationOperatorDescArray(); + } + case DML_SCHEMA_FIELD_TYPE_UINT: + { + OperatorFieldTypes::UInt data; + if (attributeDesc != nullptr) + { + data = attributeDesc->val_as_UInt32()->data(); + } + return data; + } + case DML_SCHEMA_FIELD_TYPE_UINT64: + { + OperatorFieldTypes::UInt64 data; + if (attributeDesc != nullptr) + { + data = attributeDesc->val_as_UInt64()->data(); + } + return data; + } + case DML_SCHEMA_FIELD_TYPE_INT: + { + OperatorFieldTypes::Int data; + if (attributeDesc != nullptr) + { + data = attributeDesc->val_as_Int32()->data(); + } + return data; + } + case DML_SCHEMA_FIELD_TYPE_FLOAT: + { + OperatorFieldTypes::Float data; + if (attributeDesc != nullptr) + { + data = attributeDesc->val_as_Float32()->data(); + } + return data; + } + case DML_SCHEMA_FIELD_TYPE_UINT_ARRAY: + { + OperatorFieldTypes::UIntArray data; + if (attributeDesc != nullptr) + { + data.assign(attributeDesc->val_as_UIntArray()->data()->begin(), attributeDesc->val_as_UIntArray()->data()->end()); + } + return data; + } + case DML_SCHEMA_FIELD_TYPE_INT_ARRAY: + { + OperatorFieldTypes::IntArray data; + if (attributeDesc != nullptr) + { + data.assign(attributeDesc->val_as_IntArray()->data()->begin(), attributeDesc->val_as_IntArray()->data()->end()); + } + return data; + } + case DML_SCHEMA_FIELD_TYPE_FLOAT_ARRAY: + { + OperatorFieldTypes::FloatArray data; + if (attributeDesc != nullptr) + { + data.assign(attributeDesc->val_as_FloatArray()->data()->begin(), attributeDesc->val_as_FloatArray()->data()->end()); + } + return data; + } + case DML_SCHEMA_FIELD_TYPE_SCALE_BIAS: + { + OperatorFieldTypes::ScaleBias scaleBias; + const dml::ir::operatorFieldTypes::ScaleBias* scaleBiasAttribute = attributeDesc->val_as_ScaleBias(); + if (scaleBiasAttribute != nullptr) + { + scaleBias = {scaleBiasAttribute->scale(), scaleBiasAttribute->bias()}; + } + return scaleBias; + } + case DML_SCHEMA_FIELD_TYPE_SIZE_2D: + { + OperatorFieldTypes::Size2D size2d = {}; + if (attributeDesc != nullptr) + { + size2d.Height = attributeDesc->val_as_Size2D()->height(); + size2d.Width = attributeDesc->val_as_Size2D()->width(); + } + return size2d; + } + case DML_SCHEMA_FIELD_TYPE_SCALAR_UNION: + { + DML_SCALAR_UNION scalarUnion; + if (attributeDesc != nullptr) + { + const dml::ir::operatorFieldTypes::ByteArray* byteArr = attributeDesc->val_as_ScalarUnionData()->data_as_ByteArray(); + std::copy(byteArr->data()->begin(), byteArr->data()->end(), scalarUnion.Bytes); + } + return scalarUnion; + } + case DML_SCHEMA_FIELD_TYPE_BOOL: + { + OperatorFieldTypes::Bool data; + if (attributeDesc != nullptr) + { + data = attributeDesc->val_as_Bool()->data(); + } + return data; + } + default: + { + throw std::invalid_argument("Invalid attribute type."); + } + } +} + +OperatorFieldTypes::TensorDesc CreateBufferTensorDesc( + const dml::ir::DmlBufferTensorDesc* tensorDesc, + const bool isConstantTensor = false) +{ + DmlBufferTensorDesc bufferTensorDesc = {}; + bufferTensorDesc.dataType = ApiTraits::StringifyHelpers::FromString(tensorDesc->dataType()->c_str()); + if (isConstantTensor) + { + bufferTensorDesc.flags = DML_TENSOR_FLAG_OWNED_BY_DML; + } + bufferTensorDesc.sizes.assign(tensorDesc->sizes()->begin(), tensorDesc->sizes()->end()); + if (flatbuffers::IsFieldPresent(tensorDesc, dml::ir::DmlBufferTensorDesc::VT_STRIDES)) + { + bufferTensorDesc.strides.emplace(tensorDesc->strides()->begin(), tensorDesc->strides()->end()); + } + bufferTensorDesc.totalTensorSizeInBytes = 
tensorDesc->totalTensorSizeInBytes(); + return bufferTensorDesc; +} + +AbstractOperatorDesc CreateAbstractOperatorDesc( + uint32_t nodeIndex, + const dml::ir::OperatorNodeDesc* flatbufferOperatorNodeDesc, + const ::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>>* nodeInputNames, + const ::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>>* nodeOutputNames, + const std::unordered_set& constantInputs) +{ + DML_OPERATOR_TYPE type = ApiTraits::StringifyHelpers::FromString(flatbufferOperatorNodeDesc->type()->c_str()); + if (type == DML_OPERATOR_INVALID) + { + throw std::invalid_argument("Graph operator node at index:" + std::to_string(nodeIndex) + + " either has empty or invalid operator type."); + } + const DML_OPERATOR_SCHEMA& schema = SchemaHelpers::GetSchema(type); + std::vector operatorFields(schema.FieldCount); + + auto inputNameItr = nodeInputNames->begin(); + uint32_t inputTensorDescIndex = 0; + + uint32_t outputTensorDescIndex = 0; + auto outputNameItr = nodeOutputNames->begin(); + + uint32_t attributeIndex = 0; + + + for (uint32_t fieldIndex = 0; fieldIndex < schema.FieldCount; fieldIndex++) + { + const DML_SCHEMA_FIELD* schemaField = &schema.Fields[fieldIndex]; + + OperatorFieldVariant field; + switch (schemaField->Kind) + { + case DML_SCHEMA_FIELD_KIND_INPUT_TENSOR: + { + if (inputNameItr == nodeInputNames->end()) + { + throw std::invalid_argument("Missing input names for node at index:" + std::to_string(nodeIndex)); + } + + if (schemaField->Type == DML_SCHEMA_FIELD_TYPE_TENSOR_DESC) + { + const flatbuffers::String* inputName = *inputNameItr; + inputNameItr++; + if (inputName->size() == 0) + { + field = OperatorFieldTypes::TensorDesc(); + break; + } + bool isConstantTensor = !constantInputs.empty() && constantInputs.find(inputName->c_str()) != constantInputs.end(); + + if (flatbufferOperatorNodeDesc->inputs()->size() <= inputTensorDescIndex) + { + throw std::invalid_argument("Expecting at least " + std::to_string(inputTensorDescIndex + 1) + + "input tensor desc for graph operator node at index:" + std::to_string(nodeIndex)); + } + const dml::ir::DmlBufferTensorDesc* tensorDesc = flatbufferOperatorNodeDesc->inputs()->Get(inputTensorDescIndex++); + field = CreateBufferTensorDesc(tensorDesc, isConstantTensor); + } + else if (schemaField->Type == DML_SCHEMA_FIELD_TYPE_TENSOR_DESC_ARRAY) + { + std::vector tensors; + while (inputTensorDescIndex < static_cast(flatbufferOperatorNodeDesc->inputs()->size())) + { + const flatbuffers::String* inputName = *inputNameItr; + inputNameItr++; + bool isConstantTensor = !constantInputs.empty() && constantInputs.find(inputName->c_str()) != constantInputs.end(); + + if (flatbufferOperatorNodeDesc->inputs()->size() <= inputTensorDescIndex) + { + throw std::invalid_argument("Expecting at least " + std::to_string(inputTensorDescIndex + 1) + + "input tensor desc for graph operator node at index:" + std::to_string(nodeIndex)); + } + const dml::ir::DmlBufferTensorDesc* tensorDesc = flatbufferOperatorNodeDesc->inputs()->Get(inputTensorDescIndex++); + tensors.push_back(CreateBufferTensorDesc(tensorDesc, isConstantTensor).value()); + } + field = tensors; + } + break; + } + case DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR: + { + if (outputNameItr == nodeOutputNames->end()) + { + throw std::invalid_argument("Missing output names for node at index:" + std::to_string(nodeIndex)); + } + + if (schemaField->Type == DML_SCHEMA_FIELD_TYPE_TENSOR_DESC) + { + const flatbuffers::String* outputName = *outputNameItr; + outputNameItr++; + + if 
(outputName->size() == 0) + { + field = OperatorFieldTypes::TensorDesc(); + break; + } + + if (flatbufferOperatorNodeDesc->outputs()->size() <= outputTensorDescIndex) + { + throw std::invalid_argument("Expecting at least " + std::to_string(outputTensorDescIndex + 1) + + "output tensor desc for graph operator node at index:" + std::to_string(nodeIndex)); + } + const dml::ir::DmlBufferTensorDesc* tensorDesc = flatbufferOperatorNodeDesc->outputs()->Get(outputTensorDescIndex++); + field = CreateBufferTensorDesc(tensorDesc); + } + else if (schemaField->Type == DML_SCHEMA_FIELD_TYPE_TENSOR_DESC_ARRAY) + { + std::vector tensors; + while (outputTensorDescIndex < static_cast(flatbufferOperatorNodeDesc->outputs()->size())) + { + if (flatbufferOperatorNodeDesc->outputs()->size() <= outputTensorDescIndex) + { + throw std::invalid_argument("Expecting at least " + std::to_string(outputTensorDescIndex + 1) + + "output tensor desc for graph operator node at index:" + std::to_string(nodeIndex)); + } + const dml::ir::DmlBufferTensorDesc* tensorDesc = flatbufferOperatorNodeDesc->outputs()->Get(outputTensorDescIndex++); + tensors.push_back(CreateBufferTensorDesc(tensorDesc).value()); + } + field = tensors; + } + break; + } + case DML_SCHEMA_FIELD_KIND_ATTRIBUTE: + { + if (flatbufferOperatorNodeDesc->attributes()->size() <= attributeIndex) + { + throw std::invalid_argument("Expecting at least " + std::to_string(attributeIndex + 1) + + "attributes for graph operator node at index:" + std::to_string(nodeIndex)); + } + const dml::ir::operatorFieldTypes::AttributeDesc* attributeDesc = + attributeIndex >= flatbufferOperatorNodeDesc->attributes()->size() ? + nullptr : + flatbufferOperatorNodeDesc->attributes()->Get(attributeIndex++); + field = CreateAttribute(schemaField, attributeDesc); + break; + } + } + + operatorFields[fieldIndex] = OperatorField(schemaField, std::move(field)); + } + + return AbstractOperatorDesc(&schema, std::move(operatorFields)); +} + +std::unordered_map ConvertToEdgeNameToIndexMap( + const ::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>>* list) +{ + std::unordered_map nameToIndexMap; + for (uint32_t index = 0; index < list->size(); index++) + { + const flatbuffers::String* name = list->GetAsString(index); + if (name->size() == 0) + { + continue; + } + nameToIndexMap[name->string_view()] = index; + } + return nameToIndexMap; // NRVO will automatically move it. 
no need to use std::move +} + +template void PopulateEdges( + const uint32_t nodeIndex, + const ::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>>* edgeNames, + const std::unordered_map& edgeNameToIndexMap, + /*out*/ std::vector& edges, + /*out*/ std::vector& intermediateEdges, + /*out*/ std::unordered_map& edgeToOutgoingNodeIndexMap) +{ + for (flatbuffers::uoffset_t edgeIndex = 0; edgeIndex < edgeNames->size(); edgeIndex++) + { + const flatbuffers::String* edgeName = edgeNames->Get(edgeIndex); + if (edgeName->size() == 0) + { + // This must be optional input/output + continue; + } + // edge can be graphInput or graphOutput + if (edgeNameToIndexMap.find(edgeName->string_view()) != edgeNameToIndexMap.end()) + { + EdgeType edge = {}; + edge.Name = edgeName->str(); + + if constexpr (std::is_same_v) + { + edge.GraphInputIndex = edgeNameToIndexMap.at(edgeName->string_view()); + edge.ToNodeIndex = nodeIndex; + edge.ToNodeInputIndex = edgeIndex; + } + else if constexpr (std::is_same_v) + { + edge.GraphOutputIndex = edgeNameToIndexMap.at(edgeName->string_view()); + edge.FromNodeIndex = nodeIndex; + edge.FromNodeOutputIndex = edgeIndex; + edgeToOutgoingNodeIndexMap[edgeName->string_view()] = {nodeIndex, edgeIndex}; + } + + edges.push_back(edge); + } + // edge is intermediate edge + else + { + if constexpr (std::is_same_v) + { + if (edgeToOutgoingNodeIndexMap.find(edgeName->string_view()) == edgeToOutgoingNodeIndexMap.end()) + { + throw std::range_error("There is neither a graph input with name " + edgeName->str() + + " nor a node which has " + edgeName->str() + " as one of its outputs."); + } + auto& intermediateEdgeNodeIndex = edgeToOutgoingNodeIndexMap[edgeName->string_view()]; + DmlIntermediateSerializedGraphEdge intermediateEdge = {}; + intermediateEdge.Name = edgeName->str(); + intermediateEdge.FromNodeIndex = intermediateEdgeNodeIndex.nodeIndex; + intermediateEdge.FromNodeOutputIndex = intermediateEdgeNodeIndex.nodeOutputIndex; + intermediateEdge.ToNodeIndex = nodeIndex; + intermediateEdge.ToNodeInputIndex = edgeIndex; + intermediateEdges.push_back(std::move(intermediateEdge)); + } + else if constexpr (std::is_same_v) + { + edgeToOutgoingNodeIndexMap[edgeName->string_view()] = {nodeIndex, edgeIndex}; + } + } + } +} + +/* +* - Handling of empty optional input/output/attribute for non-constant node: +* input/output +* - and will have a null entry +* but the actual OperatorNodeDesc variant's +* and will not have any entry. 
+* attribute +* - will have null entry +*/ +DmlSerializedGraphDesc DeserializeDmlGraph( + const uint8_t* flatbufferGraphDescBlob, + /*out*/ std::vector>& rawData) +{ + if (flatbufferGraphDescBlob == nullptr) + { + throw std::invalid_argument("Given pointer to flatbuffer blob is null"); + } + const dml::ir::DmlGraphDesc* flatbufferGraphDesc = dml::ir::GetDmlGraphDesc(flatbufferGraphDescBlob); + + std::unordered_map graphInputEdgeToIndexMap = ConvertToEdgeNameToIndexMap(flatbufferGraphDesc->graphInputNames()); + std::unordered_map graphOutputEdgeToIndexMap = ConvertToEdgeNameToIndexMap(flatbufferGraphDesc->graphOutputNames()); + + std::unordered_map edgeToOutgoingNodeIndexMap; + std::unordered_set constantInputs; + + std::vector nodes(flatbufferGraphDesc->nodes()->size()); + std::vector inputEdges; + std::vector outputEdges; + std::vector intermediateEdges; + + for (uint32_t nodeIndex = 0; nodeIndex < flatbufferGraphDesc->nodes()->size(); nodeIndex++) + { + const dml::ir::DmlGraphNode* flatbufferNode = flatbufferGraphDesc->nodes()->Get(nodeIndex); + + PopulateEdges( + nodeIndex, + flatbufferNode->inputNames(), + graphInputEdgeToIndexMap, + inputEdges, + intermediateEdges, + edgeToOutgoingNodeIndexMap); + PopulateEdges( + nodeIndex, + flatbufferNode->outputNames(), + graphOutputEdgeToIndexMap, + outputEdges, + intermediateEdges, + edgeToOutgoingNodeIndexMap); + + DmlSerializedGraphNode node = {}; + if (flatbufferNode->name()->size() == 0) + { + throw std::invalid_argument("Graph node at index:" + std::to_string(nodeIndex) + " doesn't have any name"); + } + node.Name = flatbufferNode->name()->c_str(); + + if (flatbufferNode->desc_type() == dml::ir::NodeDesc_ConstantNodeDesc) + { + const dml::ir::ConstantNodeDesc* flatbufferConstantNode = flatbufferNode->desc_as_ConstantNodeDesc(); + if (flatbufferConstantNode->data_type() == dml::ir::ConstantNodeDescDetail_ConstantName) + { + if (flatbufferConstantNode->data_as_ConstantName()->name()->size() == 0) + { + throw std::invalid_argument("Constant node at index:" + std::to_string(nodeIndex) + + " doesn't have constant data name."); + } + + ConstantName constantNode = {flatbufferConstantNode->data_as_ConstantName()->name()->c_str()}; + node.Desc = constantNode; + // output of this node will part of constantInputs list + for (uint32_t outputIndex = 0; outputIndex < flatbufferNode->outputNames()->size(); outputIndex++) + { + constantInputs.insert(flatbufferNode->outputNames()->Get(outputIndex)->c_str()); + } + } + else if (flatbufferConstantNode->data_type() == dml::ir::ConstantNodeDescDetail_ConstantRawData) + { + + uint32_t rawDataSize = flatbufferConstantNode->data_as_ConstantRawData()->data()->size(); + rawData.push_back(std::make_unique(rawDataSize)); + std::transform( + flatbufferConstantNode->data_as_ConstantRawData()->data()->begin(), + flatbufferConstantNode->data_as_ConstantRawData()->data()->end(), + rawData.back().get(), + [](uint8_t b) {return static_cast(b);}); + + ConstantData constantData = {}; + constantData.dataSize = rawDataSize; + constantData.data = rawData.back().get(); + node.Desc = constantData; + } + + + } + else if (flatbufferNode->desc_type() == dml::ir::NodeDesc::NodeDesc_OperatorNodeDesc) + { + // convert dml::ir::OperatorNodeDesc to AbstractOperatorDesc + const dml::ir::OperatorNodeDesc* flatbufferOperatorNodeDesc = flatbufferNode->desc_as_OperatorNodeDesc(); + node.Desc = CreateAbstractOperatorDesc( + nodeIndex, + flatbufferOperatorNodeDesc, + flatbufferNode->inputNames(), + flatbufferNode->outputNames(), + 
constantInputs); + } + + nodes[nodeIndex] = node; + } + + DmlSerializedGraphDesc graphDesc; + graphDesc.InputCount = flatbufferGraphDesc->graphInputNames()->size(); + graphDesc.OutputCount = flatbufferGraphDesc->graphOutputNames()->size(); + graphDesc.InputEdges = std::move(inputEdges); + graphDesc.IntermediateEdges = std::move(intermediateEdges); + graphDesc.OutputEdges = std::move(outputEdges); + graphDesc.Nodes = std::move(nodes); + return graphDesc; +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp index 642d9aa03eeef..202b762d99e01 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp @@ -135,8 +135,10 @@ namespace DmlGraphFusionHelper void ProcessInputData( const ExecutionProviderImpl* providerImpl, + const bool graphSerializationEnabled, const std::vector& isInputsUploadedByDmlEP, - const std::vector& inputEdges, + const std::unordered_map* serializedGraphInputIndexToSubgraphInputIndex, + const std::unordered_map* serializedGraphLargeConstantNameToSubgraphInputIndex, const gsl::span subGraphInputArgNames, const std::unordered_map>& initializerNameToInitializerMap, onnxruntime::Graph& graph, @@ -162,8 +164,17 @@ namespace DmlGraphFusionHelper // Walk through each graph edge and mark used inputs inputsUsed.assign(fusedNodeInputCount, false); - for (const DML_INPUT_GRAPH_EDGE_DESC& edge : inputEdges) { - inputsUsed[edge.GraphInputIndex] = true; + for (auto it = serializedGraphInputIndexToSubgraphInputIndex->begin(); it != serializedGraphInputIndexToSubgraphInputIndex->end(); it++) { + inputsUsed[it->second] = true; + } + for (auto it = serializedGraphLargeConstantNameToSubgraphInputIndex->begin(); it != serializedGraphLargeConstantNameToSubgraphInputIndex->end(); it++) { + inputsUsed[it->second] = true; + } + + std::wstring modelName; + if (graphSerializationEnabled) + { + modelName = GetModelName(graph.ModelPath()); } for (uint32_t i = 0; i < initInputBindings.size(); i++) @@ -209,6 +220,10 @@ namespace DmlGraphFusionHelper // Tensor sizes in DML must be a multiple of 4 bytes large. 
tensorByteSize = AlignToPow2(tensorByteSize, 4); + if(graphSerializationEnabled) + { + WriteToFile(modelName, ConvertToWString(iter->first) + L".bin", reinterpret_cast(tensorPtr), tensorByteSize); + } if (inputRawData) { @@ -287,55 +302,158 @@ namespace DmlGraphFusionHelper return initializerPartitionMap; } + inline uint32_t GetConstantNodeGraphInputIndex( + const std::string& constantName, + const std::unordered_map* serializedGraphConstantNameToMainGraphInputIndex, + uint32_t& graphMaxInputIndex, + std::unordered_map& localConstantNameToIndexMap) + { + if (serializedGraphConstantNameToMainGraphInputIndex == nullptr) + { + if (localConstantNameToIndexMap.find(constantName) == localConstantNameToIndexMap.end()) + { + localConstantNameToIndexMap[constantName] = ++graphMaxInputIndex; + } + return localConstantNameToIndexMap[constantName]; + } + else + { + graphMaxInputIndex = std::max(graphMaxInputIndex, serializedGraphConstantNameToMainGraphInputIndex->at(constantName)); + return serializedGraphConstantNameToMainGraphInputIndex->at(constantName); + } + } + + template void ConvertGraphDesc( const Dml::GraphDescBuilder::GraphDesc& graphDesc, - _Out_ DML_GRAPH_DESC& dmlGraphDesc, const uint32_t inputCount, const uint32_t outputCount, - _Inout_ std::vector& dmlOperatorGraphNodes, - _Inout_ std::vector& dmlConstantGraphNodes, + IDMLDevice* device, + StackAllocator& allocator, + const std::unordered_map* serializedGraphInputIndexToSubgraphInputIndex, + const std::unordered_map* serializedGraphLargeConstantNameToSubgraphInputIndex, + _Out_ DML_GRAPH_DESC& dmlGraphDesc, + _Inout_ std::vector>& dmlOperators, _Inout_ std::vector& dmlGraphNodes, _Inout_ std::vector& dmlInputEdges, _Inout_ std::vector& dmlOutputEdges, _Inout_ std::vector& dmlIntermediateEdges) { - for (size_t i = 0; i < graphDesc.nodes.size(); ++i) + std::unordered_map oldNodeIndexToNewNodeIndexMap; + for (uint32_t index = 0; index < static_cast(graphDesc.Nodes.size()); index++) { - auto& nodeInfo = graphDesc.nodes[i]; - - if (std::holds_alternative>(nodeInfo.nodeDef)) + const DmlSerializedGraphNode& node = graphDesc.Nodes[index]; + if (std::holds_alternative(node.Desc)) { - dmlOperatorGraphNodes[i] = DML_OPERATOR_GRAPH_NODE_DESC{std::get>(nodeInfo.nodeDef).Get(), nodeInfo.name.data()}; - dmlGraphNodes[i] = DML_GRAPH_NODE_DESC{DML_GRAPH_NODE_TYPE_OPERATOR, &dmlOperatorGraphNodes[i]}; + oldNodeIndexToNewNodeIndexMap[index] = static_cast(dmlGraphNodes.size()); + DML_OPERATOR_DESC dmlDesc = SchemaHelpers::ConvertOperatorDesc(std::get(node.Desc), &allocator); + ComPtr op; + ORT_THROW_IF_FAILED(device->CreateOperator(&dmlDesc, IID_PPV_ARGS(&op))); + dmlOperators.push_back(op); + DML_OPERATOR_GRAPH_NODE_DESC* dmlOperatorGraphNode = allocator.template Allocate(); + dmlOperatorGraphNode->Name = node.Name.data(); + dmlOperatorGraphNode->Operator = op.Get(); + dmlGraphNodes.push_back(DML_GRAPH_NODE_DESC{DML_GRAPH_NODE_TYPE_OPERATOR, dmlOperatorGraphNode}); } else { - auto& nodeDefinitionData = std::get>(nodeInfo.nodeDef); - dmlConstantGraphNodes[i] = DML_CONSTANT_DATA_GRAPH_NODE_DESC{ - nodeDefinitionData.data(), - nodeDefinitionData.size(), - nodeInfo.name.data() - }; - - // TODO: Change as new header is ingested - dmlGraphNodes[i] = DML_GRAPH_NODE_DESC{static_cast(2), &dmlConstantGraphNodes[i]}; + auto& constantNodeVariant = std::get(node.Desc); + if (std::holds_alternative(constantNodeVariant)) + { + oldNodeIndexToNewNodeIndexMap[index] = static_cast(dmlGraphNodes.size()); + + auto& constantData = std::get(constantNodeVariant); + + 
DML_CONSTANT_DATA_GRAPH_NODE_DESC* constantNode = allocator.template Allocate(); + constantNode->Name = node.Name.data(); + constantNode->DataSize = constantData.dataSize; + constantNode->Data = constantData.data; + dmlGraphNodes.push_back(DML_GRAPH_NODE_DESC{DML_GRAPH_NODE_TYPE_CONSTANT, constantNode}); + } } } - for (size_t i = 0; i < graphDesc.inputEdges.size(); ++i) + uint32_t graphMaxInputIndex = 0; + + for (size_t i = 0; i < graphDesc.InputEdges.size(); ++i) { - dmlInputEdges[i] = DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_INPUT, &graphDesc.inputEdges[i]}; + DML_INPUT_GRAPH_EDGE_DESC* edge = allocator.template Allocate(); + // 1. If serializedGraphInputIndexToMainGraphInputIndex is not null: + // then use the corresponding main graph input index, because the caller will use corresponding + // main graph input index for extracting the actual input tensor from the main graph and + // the caller does not own the creation of dml bindings directly. + // Use Case: When the caller is ORT (DML EP) or DmlEngine. + // + // 2. If serializedGraphInputIndexToMainGraphInputIndex is null: + // then assign the sequential graph input index, because it owns the creation of dml bindings + // directly. + edge->GraphInputIndex = serializedGraphInputIndexToSubgraphInputIndex == nullptr ? + graphDesc.InputEdges[i].GraphInputIndex : + serializedGraphInputIndexToSubgraphInputIndex->at(graphDesc.InputEdges[i].GraphInputIndex); + edge->ToNodeIndex = oldNodeIndexToNewNodeIndexMap[graphDesc.InputEdges[i].ToNodeIndex]; + edge->ToNodeInputIndex = graphDesc.InputEdges[i].ToNodeInputIndex; + edge->Name = graphDesc.InputEdges[i].Name.data(); + + graphMaxInputIndex = std::max(graphMaxInputIndex, edge->GraphInputIndex); + dmlInputEdges.push_back(DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_INPUT, edge}); } - for (size_t i = 0; i < graphDesc.outputEdges.size(); ++i) + for (size_t i = 0; i < graphDesc.OutputEdges.size(); ++i) { - dmlOutputEdges[i] = DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_OUTPUT, &graphDesc.outputEdges[i]}; + DML_OUTPUT_GRAPH_EDGE_DESC* edge = allocator.template Allocate(); + edge->GraphOutputIndex = graphDesc.OutputEdges[i].GraphOutputIndex; + edge->FromNodeIndex = oldNodeIndexToNewNodeIndexMap[graphDesc.OutputEdges[i].FromNodeIndex]; + edge->FromNodeOutputIndex = graphDesc.OutputEdges[i].FromNodeOutputIndex; + edge->Name = graphDesc.OutputEdges[i].Name.data(); + + dmlOutputEdges.push_back(DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_OUTPUT, edge}); } - for (size_t i = 0; i < graphDesc.intermediateEdges.size(); ++i) + std::unordered_map localConstantNameToIndexMap; + for (uint32_t i = 0; i < static_cast(graphDesc.IntermediateEdges.size()); ++i) { - dmlIntermediateEdges[i] = - DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_INTERMEDIATE, &graphDesc.intermediateEdges[i]}; + DmlSerializedGraphNodeDescVariant descVariant = graphDesc.Nodes[graphDesc.IntermediateEdges[i].FromNodeIndex].Desc; + bool isConstantEdge = std::holds_alternative(descVariant); + if (isConstantEdge) + { + auto& constantNodeVariant = std::get(descVariant); + if (std::holds_alternative(constantNodeVariant)) + { + DML_INTERMEDIATE_GRAPH_EDGE_DESC* edge = allocator.template Allocate(); + edge->FromNodeIndex = oldNodeIndexToNewNodeIndexMap[graphDesc.IntermediateEdges[i].FromNodeIndex]; + edge->FromNodeOutputIndex = graphDesc.IntermediateEdges[i].FromNodeOutputIndex; + edge->ToNodeIndex = oldNodeIndexToNewNodeIndexMap[graphDesc.IntermediateEdges[i].ToNodeIndex]; + edge->ToNodeInputIndex = graphDesc.IntermediateEdges[i].ToNodeInputIndex; + edge->Name = 
graphDesc.IntermediateEdges[i].Name.data(); + dmlIntermediateEdges.push_back(DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_INTERMEDIATE, edge}); + } + else + { + const std::string& constantName = graphDesc.Nodes[graphDesc.IntermediateEdges[i].FromNodeIndex].Name; + + DML_INPUT_GRAPH_EDGE_DESC* edge = allocator.template Allocate(); + edge->GraphInputIndex = GetConstantNodeGraphInputIndex( + constantName, + serializedGraphLargeConstantNameToSubgraphInputIndex, + graphMaxInputIndex, + localConstantNameToIndexMap); + edge->ToNodeIndex = oldNodeIndexToNewNodeIndexMap[graphDesc.IntermediateEdges[i].ToNodeIndex]; + edge->ToNodeInputIndex = graphDesc.IntermediateEdges[i].ToNodeInputIndex; + edge->Name = graphDesc.IntermediateEdges[i].Name.data(); + + dmlInputEdges.push_back({DML_GRAPH_EDGE_TYPE_INPUT, edge}); + } + } + else + { + DML_INTERMEDIATE_GRAPH_EDGE_DESC* edge = allocator.template Allocate(); + edge->FromNodeIndex = oldNodeIndexToNewNodeIndexMap[graphDesc.IntermediateEdges[i].FromNodeIndex]; + edge->FromNodeOutputIndex = graphDesc.IntermediateEdges[i].FromNodeOutputIndex; + edge->ToNodeIndex = oldNodeIndexToNewNodeIndexMap[graphDesc.IntermediateEdges[i].ToNodeIndex]; + edge->ToNodeInputIndex = graphDesc.IntermediateEdges[i].ToNodeInputIndex; + edge->Name = graphDesc.IntermediateEdges[i].Name.data(); + dmlIntermediateEdges.push_back(DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_INTERMEDIATE, edge}); + } } dmlGraphDesc.InputCount = inputCount; @@ -400,27 +518,34 @@ namespace DmlGraphFusionHelper Microsoft::WRL::ComPtr TryCreateCompiledOperator( const GraphDescBuilder::GraphDesc& graphDesc, const onnxruntime::IndexedSubGraph& indexedSubGraph, - const ExecutionProviderImpl* providerImpl) + const ExecutionProviderImpl* providerImpl, + const std::unordered_map* serializedGraphInputIndexToSubgraphInputIndex, + const std::unordered_map* serializedGraphLargeConstantNameToSubgraphInputIndex) { const uint32_t fusedNodeInputCount = gsl::narrow_cast(indexedSubGraph.GetMetaDef()->inputs.size()); const uint32_t fusedNodeOutputCount = gsl::narrow_cast(indexedSubGraph.GetMetaDef()->outputs.size()); // convert DML EP GraphDesc into DML_GRAPH_DESC and create IDMLCompiledOperator - DML_GRAPH_DESC dmlGraphDesc = {}; - std::vector dmlOperatorGraphNodes(graphDesc.nodes.size()); - std::vector dmlConstantGraphNodes(graphDesc.nodes.size()); + ComPtr device; + ORT_THROW_IF_FAILED(providerImpl->GetDmlDevice(device.GetAddressOf())); - std::vector dmlGraphNodes(graphDesc.nodes.size()); - std::vector dmlInputEdges(graphDesc.inputEdges.size()); - std::vector dmlOutputEdges(graphDesc.outputEdges.size()); - std::vector dmlIntermediateEdges(graphDesc.intermediateEdges.size()); + StackAllocator<1024> allocator; + DML_GRAPH_DESC dmlGraphDesc = {}; + std::vector> dmlOperators; + std::vector dmlGraphNodes; + std::vector dmlInputEdges; + std::vector dmlOutputEdges; + std::vector dmlIntermediateEdges; ConvertGraphDesc( graphDesc, - dmlGraphDesc, fusedNodeInputCount, fusedNodeOutputCount, - dmlOperatorGraphNodes, - dmlConstantGraphNodes, + device.Get(), + allocator, + serializedGraphInputIndexToSubgraphInputIndex, + serializedGraphLargeConstantNameToSubgraphInputIndex, + dmlGraphDesc, + dmlOperators, dmlGraphNodes, dmlInputEdges, dmlOutputEdges, @@ -438,8 +563,6 @@ namespace DmlGraphFusionHelper executionFlags |= DML_EXECUTION_FLAG_DISABLE_META_COMMANDS; } - ComPtr device; - ORT_THROW_IF_FAILED(providerImpl->GetDmlDevice(device.GetAddressOf())); ComPtr device1; ORT_THROW_IF_FAILED(device.As(&device1)); @@ -460,6 +583,7 @@ namespace 
DmlGraphFusionHelper } void FusePartitionAndRegisterKernel( + const uint32_t partitionIndex, onnxruntime::Graph& graph, onnxruntime::KernelRegistry* registryForPartitionKernels, const std::unordered_map>& initializerNameToInitializerMap, @@ -467,8 +591,43 @@ namespace DmlGraphFusionHelper const onnxruntime::IndexedSubGraph& indexedSubGraph, std::vector&& isInputsUploadedByDmlEP, const GraphDescBuilder::GraphDesc& graphDesc, - Microsoft::WRL::ComPtr compiledExecutionPlanOperator) + Microsoft::WRL::ComPtr compiledExecutionPlanOperator, + const bool graphSerializationEnabled, + const std::unordered_map* serializedGraphInputIndexToSubgraphInputIndex, + const std::unordered_map* serializedGraphLargeConstantNameToSubgraphInputIndex) { + if (graphSerializationEnabled) + { + + const std::wstring modelName = GetModelName(graph.ModelPath()); + auto buffer = SerializeDmlGraph(graphDesc); + + const std::wstring partitionName = + L"Partition_" + + std::to_wstring(partitionIndex) + + L".bin"; + WriteToFile(modelName, partitionName, buffer.data(), buffer.size()); + + std::vector> rawData; + DmlSerializedGraphDesc deserializedGraphDesc = DeserializeDmlGraph(buffer.data(), rawData); + GraphDescBuilder::GraphDesc deserializedDmlGraphDesc = {}; + deserializedDmlGraphDesc.InputCount = deserializedGraphDesc.InputCount; + deserializedDmlGraphDesc.InputEdges = std::move(deserializedGraphDesc.InputEdges); + deserializedDmlGraphDesc.IntermediateEdges = std::move(deserializedGraphDesc.IntermediateEdges); + deserializedDmlGraphDesc.Nodes = std::move(deserializedGraphDesc.Nodes); + deserializedDmlGraphDesc.OutputCount = deserializedGraphDesc.OutputCount; + deserializedDmlGraphDesc.OutputEdges = std::move(deserializedGraphDesc.OutputEdges); + deserializedDmlGraphDesc.reuseCommandList = graphDesc.reuseCommandList; + deserializedDmlGraphDesc.outputShapes = graphDesc.outputShapes; + + compiledExecutionPlanOperator = DmlGraphFusionHelper::TryCreateCompiledOperator( + deserializedDmlGraphDesc, + indexedSubGraph, + providerImpl, + serializedGraphInputIndexToSubgraphInputIndex, + serializedGraphLargeConstantNameToSubgraphInputIndex); + } + auto& fusedNode = graph.BeginFuseSubGraph(indexedSubGraph, indexedSubGraph.GetMetaDef()->name); fusedNode.SetExecutionProviderType(onnxruntime::kDmlExecutionProvider); @@ -482,8 +641,10 @@ namespace DmlGraphFusionHelper std::vector inputsUsed; ProcessInputData( providerImpl, + graphSerializationEnabled, isInputsUploadedByDmlEP, - graphDesc.inputEdges, + serializedGraphInputIndexToSubgraphInputIndex, + serializedGraphLargeConstantNameToSubgraphInputIndex, indexedSubGraph.GetMetaDef()->inputs, initializerNameToInitializerMap, graph, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h index f8f6162aaa1e0..f1e9654021196 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h @@ -45,12 +45,17 @@ namespace DmlGraphFusionHelper gsl::span> partitions ); + template void ConvertGraphDesc( const Dml::GraphDescBuilder::GraphDesc& graphDesc, - _Out_ DML_GRAPH_DESC& dmlGraphDesc, const uint32_t inputCount, const uint32_t outputCount, - _Inout_ std::vector& dmlOperatorGraphNodes, + IDMLDevice* device, + StackAllocator& allocator, + const std::unordered_map* serializedGraphInputIndexToSubgraphInputIndex, + const std::unordered_map* 
serializedGraphLargeConstantNameToSubgraphInputIndex, + _Out_ DML_GRAPH_DESC& dmlGraphDesc, + _Inout_ std::vector>& dmlOperators, _Inout_ std::vector& dmlGraphNodes, _Inout_ std::vector& dmlInputEdges, _Inout_ std::vector& dmlOutputEdges, @@ -69,9 +74,12 @@ namespace DmlGraphFusionHelper Microsoft::WRL::ComPtr TryCreateCompiledOperator( const GraphDescBuilder::GraphDesc& graphDesc, const onnxruntime::IndexedSubGraph& indexedSubGraph, - const ExecutionProviderImpl* providerImpl); + const ExecutionProviderImpl* providerImpl, + const std::unordered_map* serializedGraphInputIndexToSubgraphInputIndex, + const std::unordered_map* serializedGraphLargeConstantNameToSubgraphInputIndex); void FusePartitionAndRegisterKernel( + const uint32_t partitionIndex, onnxruntime::Graph& graph, onnxruntime::KernelRegistry* registryForPartitionKernels, const std::unordered_map>& initializerNameToInitializerMap, @@ -79,7 +87,10 @@ namespace DmlGraphFusionHelper const onnxruntime::IndexedSubGraph& indexedSubGraph, std::vector&& isInputsUploadedByDmlEP, const GraphDescBuilder::GraphDesc& graphDesc, - Microsoft::WRL::ComPtr compiledExecutionPlanOperator); + Microsoft::WRL::ComPtr compiledExecutionPlanOperator, + const bool graphSerializationEnabled, + const std::unordered_map* serializedGraphInputIndexToSubgraphInputIndex = nullptr, + const std::unordered_map* serializedGraphLargeConstantNameToSubgraphInputIndex = nullptr); void RegisterDynamicKernel( onnxruntime::Graph& graph, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp index 679738b639ec9..35a2c451a49a5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp @@ -24,15 +24,20 @@ namespace Dml std::vector isInputsUploadedByDmlEP; GraphDescBuilder::GraphDesc graphDesc; std::unordered_map> isInitializerTransferable; + std::vector> smallConstantData; // Need to keep it alive for maintaining lifetime + std::unordered_map serializedGraphInputIndexToSubgraphInputIndex; + std::unordered_map serializedGraphLargeConstantNameToSubgraphInputIndex; }; } DmlGraphFusionTransformer::DmlGraphFusionTransformer( const std::string& name, - const onnxruntime::IExecutionProvider* provider + const onnxruntime::IExecutionProvider* provider, + const bool graphSerializationEnabled ) :onnxruntime::GraphTransformer(name), - m_providerImpl(static_cast(provider)->GetImpl()) + m_providerImpl(static_cast(provider)->GetImpl()), + graphSerializationEnabled(graphSerializationEnabled) { } @@ -227,23 +232,39 @@ namespace Dml ComPtr device; ORT_THROW_IF_FAILED(m_providerImpl->GetDmlDevice(device.GetAddressOf())); + // This map will be used to transfer the initializer to D3D12 system heap memory. + // 'serializedDmlGraphDesc' will have constant input as intermediate edges, that's why + // we need a mapping between intermediateEdgeIndex and indexedSubGraph's (a given partition) + // input arg index. + // For ex: Let's say intermediate edge index = idx, then + // indexedSubGraphInputArgIdx = constantEdgeIdxToSubgraphInputArgIdxMap[idx]; + // corresponding constant tensor = initializerNameToInitializerMap[indexedSubGraph.GetMetaDef()->inputs[indexedSubGraphInputArgIdx]] + // We are using intermediate edge index as a key because same constant tensor can be used by + // multiple nodes. 
+ std::unordered_map serializedGraphInputIndexToSubgraphInputIndex; + std::unordered_map serializedGraphLargeConstantNameToSubgraphInputIndex; + std::vector> smallConstantData; GraphDescBuilder::GraphDesc graphDesc = GraphDescBuilder::BuildGraphDesc( isInputsUploadedByDmlEP.data(), isInputsUploadedByDmlEP.size(), isInitializerTransferable, partitionNodePropsMap, - device.Get(), m_providerImpl, modelPath, subgraphNodes, subgraphInputs, - subgraphOutputs); + subgraphOutputs, + serializedGraphInputIndexToSubgraphInputIndex, + serializedGraphLargeConstantNameToSubgraphInputIndex, + smallConstantData); // Compile the operator auto compiledPartition = DmlGraphFusionHelper::TryCreateCompiledOperator( graphDesc, indexedSubGraph, - m_providerImpl); + m_providerImpl, + &serializedGraphInputIndexToSubgraphInputIndex, + &serializedGraphLargeConstantNameToSubgraphInputIndex); if (!compiledPartition) { @@ -264,6 +285,9 @@ namespace Dml compiledPartitionInfo->isInputsUploadedByDmlEP = std::move(isInputsUploadedByDmlEP); compiledPartitionInfo->graphDesc = std::move(graphDesc); compiledPartitionInfo->isInitializerTransferable = std::move(isInitializerTransferable); + compiledPartitionInfo->smallConstantData = std::move(smallConstantData); + compiledPartitionInfo->serializedGraphInputIndexToSubgraphInputIndex = std::move(serializedGraphInputIndexToSubgraphInputIndex); + compiledPartitionInfo->serializedGraphLargeConstantNameToSubgraphInputIndex = std::move(serializedGraphLargeConstantNameToSubgraphInputIndex); compiledPartitionInfos[partitionIndex] = std::move(compiledPartitionInfo); } } @@ -271,12 +295,14 @@ namespace Dml } while (!additionalSplittingNodes.empty()); + uint32_t partitionIndex = 0; for (auto&& compiledPartitionInfo : compiledPartitionInfos) { // Null compiled operators were not DML partitions if (compiledPartitionInfo) { DmlGraphFusionHelper::FusePartitionAndRegisterKernel( + partitionIndex++, graph, m_providerImpl->GetKernelRegistry().get(), compiledPartitionInfo->isInitializerTransferable, @@ -284,7 +310,10 @@ namespace Dml compiledPartitionInfo->indexedSubGraph, std::move(compiledPartitionInfo->isInputsUploadedByDmlEP), compiledPartitionInfo->graphDesc, - compiledPartitionInfo->compiledOperator); + compiledPartitionInfo->compiledOperator, + graphSerializationEnabled, + &compiledPartitionInfo->serializedGraphInputIndexToSubgraphInputIndex, + &compiledPartitionInfo->serializedGraphLargeConstantNameToSubgraphInputIndex); } } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h index 19dab0c89943c..b370f3ef9043c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h @@ -16,7 +16,8 @@ class DmlGraphFusionTransformer : public onnxruntime::GraphTransformer public: DmlGraphFusionTransformer( const std::string& name, - const onnxruntime::IExecutionProvider* provider + const onnxruntime::IExecutionProvider* provider, + const bool graphSerializationEnabled ); public: @@ -38,5 +39,6 @@ class DmlGraphFusionTransformer : public onnxruntime::GraphTransformer private: const ExecutionProviderImpl* m_providerImpl = nullptr; + const bool graphSerializationEnabled = false; }; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphSerialization.cpp 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphSerialization.cpp new file mode 100644 index 0000000000000..5355964e8db74 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphSerialization.cpp @@ -0,0 +1,580 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. + +#pragma once +#include "precomp.h" + +template +T* ReadAs(uint8_t* base, size_t byteOffset) +{ + return reinterpret_cast(base + byteOffset); +} + +void SerializeAttributeDescs( + flatbuffers::FlatBufferBuilder& builder, + const AbstractOperatorDesc& operatorDesc, + /*out*/ std::vector>& attributeDescs); + +flatbuffers::Offset serializeActivation( + flatbuffers::FlatBufferBuilder& builder, + const AbstractOperatorDesc& activationOperatorDesc) +{ + std::vector> attributeDescs; + SerializeAttributeDescs(builder, activationOperatorDesc, attributeDescs); + + flatbuffers::Offset offset = dml::ir::operatorFieldTypes::CreateActivationDirect( + builder, + activationOperatorDesc.schema->OperatorName, + &attributeDescs); + return offset; +} + +void SerializeAttributeDescs( + flatbuffers::FlatBufferBuilder& builder, + const AbstractOperatorDesc& operatorDesc, + /*out*/ std::vector>& attributeDescs) +{ + for (const OperatorField& field : operatorDesc.fields) + { + if (field.GetSchema()->Kind == DML_SCHEMA_FIELD_KIND_INPUT_TENSOR || + field.GetSchema()->Kind == DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR) + { + continue; + } + + flatbuffers::Offset offset; + + if (std::holds_alternative(field.GetData())) + { + const OperatorFieldTypes::FusedActivationOperatorDesc& fusedActivation = field.AsFusedActivationOperatorDesc(); + if (!fusedActivation.has_value()) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + nullptr, + dml::ir::operatorFieldTypes::AttributeFieldVariant_Activation); + } + else + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_Activation, + serializeActivation(builder, fusedActivation.value()).Union()); + } + } + else if (std::holds_alternative(field.GetData())) + { + const OperatorFieldTypes::FusedActivationOperatorDescArray& fusedActivations = + field.AsFusedActivationOperatorDescArray(); + if (!fusedActivations.has_value()) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + nullptr, + dml::ir::operatorFieldTypes::AttributeFieldVariant_ActivationArray); + } + else + { + std::vector> fbActivations; + + for (AbstractOperatorDesc activationOpDesc : fusedActivations.value()) + { + flatbuffers::Offset fbActivation = + serializeActivation(builder, activationOpDesc); + fbActivations.push_back(fbActivation); + } + + flatbuffers::Offset activationOffset = + dml::ir::operatorFieldTypes::CreateActivationArrayDirect(builder, &fbActivations); + + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_ActivationArray, + activationOffset.Union()); + } + } + else if (std::holds_alternative(field.GetData())) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_UInt32, + builder.CreateStruct(dml::ir::operatorFieldTypes::UInt32(field.AsUInt())).Union()); + } + else if (std::holds_alternative(field.GetData())) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, 
+ dml::ir::operatorFieldTypes::AttributeFieldVariant_UInt64, + builder.CreateStruct(dml::ir::operatorFieldTypes::UInt64(field.AsUInt64())).Union()); + } + else if (std::holds_alternative(field.GetData())) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_Int32, + builder.CreateStruct(dml::ir::operatorFieldTypes::Int32(field.AsInt())).Union()); + } + else if (std::holds_alternative(field.GetData())) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_Float32, + builder.CreateStruct(dml::ir::operatorFieldTypes::Float32(field.AsFloat())).Union()); + } + else if (std::holds_alternative(field.GetData())) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_UIntArray, + dml::ir::operatorFieldTypes::CreateUIntArray(builder, builder.CreateVector(field.AsUIntArray())).Union()); + } + else if (std::holds_alternative(field.GetData())) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_IntArray, + dml::ir::operatorFieldTypes::CreateIntArray(builder, builder.CreateVector(field.AsIntArray())).Union()); + } + else if (std::holds_alternative(field.GetData())) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_FloatArray, + dml::ir::operatorFieldTypes::CreateFloatArray(builder, builder.CreateVector(field.AsFloatArray())).Union()); + } + else if (std::holds_alternative(field.GetData())) + { + const OperatorFieldTypes::ScaleBias& scaleBias = field.AsScaleBias(); + if (!scaleBias.has_value()) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + nullptr, + dml::ir::operatorFieldTypes::AttributeFieldVariant_ScaleBias); + } + else + { + dml::ir::operatorFieldTypes::ScaleBias fbScaleBias(scaleBias.value().Scale, scaleBias.value().Bias); + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_ScaleBias, + builder.CreateStruct(fbScaleBias).Union()); + } + } + else if (std::holds_alternative(field.GetData())) + { + const DML_SIZE_2D size2d = field.AsSize2D(); + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_Size2D, + builder.CreateStruct(dml::ir::operatorFieldTypes::Size2D(size2d.Width, size2d.Height)).Union()); + } + else if (std::holds_alternative(field.GetData())) + { + OperatorFieldTypes::ScalarUnion scalarUnion = field.AsScalarUnion(); + dml::ir::operatorFieldTypes::ByteArray byteArr; + for (uint32_t index = 0; index < static_cast(sizeof(scalarUnion.Bytes)); index++) + { + byteArr.mutable_data()->Mutate(index, scalarUnion.Bytes[index]); + } + + flatbuffers::Offset scalarUnionOffset = + dml::ir::operatorFieldTypes::CreateScalarUnionData( + builder, + dml::ir::operatorFieldTypes::ScalarVariant_ByteArray, + builder.CreateStruct(byteArr).Union()); + + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_ScalarUnionData, + 
scalarUnionOffset.Union()); + } + else if (std::holds_alternative(field.GetData())) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_Bool, + builder.CreateStruct(dml::ir::operatorFieldTypes::Bool(field.AsBool())).Union()); + } + else + { + continue; + } + + attributeDescs.push_back(offset); + } +} + +flatbuffers::Offset SerializeDmlTensorDesc( + flatbuffers::FlatBufferBuilder& builder, + const DmlBufferTensorDesc* tensorDesc) +{ + const std::vector *strides = nullptr; + if (tensorDesc->strides.has_value()) + { + strides = &tensorDesc->strides.value(); + } + + flatbuffers::Offset offset = dml::ir::CreateDmlBufferTensorDescDirect( + builder, + ApiTraits::StringifyHelpers::ToString(tensorDesc->dataType), + &tensorDesc->sizes, + strides, + tensorDesc->totalTensorSizeInBytes); + return offset; +} + +flatbuffers::Offset SerializeOperatorNodeDesc( + flatbuffers::FlatBufferBuilder& builder, + const AbstractOperatorDesc& operatorDesc) +{ + const DML_OPERATOR_SCHEMA* operatorSchema = operatorDesc.schema; + + std::vector> inputTensorDescs; + std::vector> outputTensorDescs; + + for (const DmlBufferTensorDesc* tensorDesc : operatorDesc.GetInputTensors()) + { + if (tensorDesc == nullptr) + { + continue; + } + flatbuffers::Offset serializedDmlTensorDesc = SerializeDmlTensorDesc(builder, tensorDesc); + inputTensorDescs.push_back(serializedDmlTensorDesc); + } + + for (const DmlBufferTensorDesc* tensorDesc : operatorDesc.GetOutputTensors()) + { + if (tensorDesc == nullptr) + { + continue; + } + flatbuffers::Offset serializedDmlTensorDesc = SerializeDmlTensorDesc(builder, tensorDesc); + outputTensorDescs.push_back(serializedDmlTensorDesc); + } + + std::vector> attributeDescs; + SerializeAttributeDescs(builder, operatorDesc, attributeDescs); + + flatbuffers::Offset offset = dml::ir::CreateOperatorNodeDesc( + builder, + builder.CreateString(operatorSchema->OperatorName), + builder.CreateVector(inputTensorDescs), + builder.CreateVector(outputTensorDescs), + builder.CreateVector(attributeDescs)); + return offset.Union(); +} + +flatbuffers::Offset SerializeConstantNodeDesc( + flatbuffers::FlatBufferBuilder& builder, + uint32_t nodeIndex, + const DmlSerializedGraphNodeConstantVariant& constantNodeDesc) +{ + flatbuffers::Offset offset; + + if (std::holds_alternative(constantNodeDesc)) + { + auto& constantName = std::get(constantNodeDesc); + if (constantName.name.empty()) + { + throw std::invalid_argument("Graph constant node at index:" + std::to_string(nodeIndex) + + " doesn't have the constant data name."); + } + + flatbuffers::Offset constantNameOffset = dml::ir::CreateConstantName( + builder, + builder.CreateString(constantName.name)); + + offset = dml::ir::CreateConstantNodeDesc( + builder, + dml::ir::ConstantNodeDescDetail_ConstantName, + constantNameOffset.Union()); + } + else + { + auto& constantData = std::get(constantNodeDesc); + std::vector rawBytes; + std::transform(constantData.data, constantData.data + constantData.dataSize, + std::back_inserter(rawBytes), [](std::byte b) {return static_cast(b); }); + flatbuffers::Offset constantDataOffset = dml::ir::CreateConstantRawDataDirect( + builder, + &rawBytes); + + offset = dml::ir::CreateConstantNodeDesc( + builder, + dml::ir::ConstantNodeDescDetail_ConstantRawData, + constantDataOffset.Union()); + } + + return offset.Union(); +} + +flatbuffers::Offset SerializeNode( + flatbuffers::FlatBufferBuilder& builder, + const uint32_t nodeIndex, + const 
DmlSerializedGraphNode& graphNode, + const std::vector>& nodeInputNames, + const std::vector>& nodeOutputNames) +{ + if (graphNode.Name.empty()) + { + throw std::invalid_argument("Graph node at index:" + std::to_string(nodeIndex) + + " does not have any name."); + } + + flatbuffers::Offset offset; + if (std::holds_alternative(graphNode.Desc)) + { + auto& operatorNode = std::get(graphNode.Desc); + offset = dml::ir::CreateDmlGraphNode( + builder, + dml::ir::NodeDesc_OperatorNodeDesc, + SerializeOperatorNodeDesc(builder, operatorNode), + builder.CreateString(graphNode.Name), + builder.CreateVector(nodeInputNames), + builder.CreateVector(nodeOutputNames)); + } + else + { + auto& constantNodeVariant = std::get(graphNode.Desc); + offset = dml::ir::CreateDmlGraphNode( + builder, + dml::ir::NodeDesc_ConstantNodeDesc, + SerializeConstantNodeDesc(builder, nodeIndex, constantNodeVariant), + builder.CreateString(graphNode.Name), + builder.CreateVector(nodeInputNames), + builder.CreateVector(nodeOutputNames)); + } + return offset; +} + +/* +* validates input/output edges and throws an exception if an edge +* does not have a name or if an edge has more than one name. +*/ +template +std::unordered_map> ConvertToEdgeIndexToNameMap( + const std::vector& edges, + flatbuffers::FlatBufferBuilder& builder) +{ + std::unordered_map> edgeIndexToNameMap; + for (auto& edge : edges) + { + uint32_t index; + if constexpr (std::is_same_v) + { + index = edge.GraphInputIndex; + } + else if constexpr (std::is_same_v) + { + index = edge.GraphOutputIndex; + } + + if (edge.Name.empty()) + { + throw std::invalid_argument("Graph input or output edge at index " + std::to_string(index) + " does not have a name."); + } + + if (edgeIndexToNameMap.find(index) != edgeIndexToNameMap.end()) + { + flatbuffers::String* edgeName = ReadAs( + builder.GetCurrentBufferPointer(), + builder.GetSize() - edgeIndexToNameMap[index].o); + if (edge.Name != edgeName->str()) + { + throw std::invalid_argument("Graph input or output edge at index " + std::to_string(index) + " has more than one name."); + } + } + + edgeIndexToNameMap[index] = builder.CreateString(edge.Name); + } + return edgeIndexToNameMap; // NRVO will automatically move it. no need to use std::move +} + +void PopulateNonConstantNodeInputOutputCount( + const std::vector& nodes, + /*out*/ std::vector& nodeInputCounts, + /*out*/ std::vector& nodeOutputCounts) +{ + for (uint32_t nodeIndex = 0; nodeIndex < static_cast(nodes.size()); nodeIndex++) + { + auto& node = nodes[nodeIndex]; + if (std::holds_alternative(node.Desc)) + { + auto& operatorNode = std::get(node.Desc); + nodeInputCounts[nodeIndex] = std::max( + nodeInputCounts[nodeIndex], + static_cast(operatorNode.GetInputTensors().size())); + + nodeOutputCounts[nodeIndex] = std::max( + nodeOutputCounts[nodeIndex], + static_cast(operatorNode.GetOutputTensors().size())); + } + } +} + +void PopulateConstantNodeInputOutputCount( + const std::vector& edges, + /*out*/std::vector& maxInputIndexForNodes, + /*out*/std::vector& maxOutputIndexForNodes) +{ + for (auto& edge : edges) + { + maxInputIndexForNodes[edge.ToNodeIndex] = std::max(maxInputIndexForNodes[edge.ToNodeIndex], edge.ToNodeInputIndex + 1); + maxOutputIndexForNodes[edge.FromNodeIndex] = std::max(maxOutputIndexForNodes[edge.FromNodeIndex], edge.FromNodeOutputIndex + 1); + } +} + +/* +* validates intermediate edges and throws an exception if an edge +* does not have a name or if an edge has more than one name. 
+*/ +void PopulateNodeInputOutputNames( + flatbuffers::FlatBufferBuilder& builder, + const DmlSerializedGraphDesc& graphDesc, + const std::unordered_map>& graphInputIndexToNameMap, + const std::unordered_map>& graphOutputIndexToNameMap, + /*out*/std::vector>>& nodeToInputNames, + /*out*/std::vector>>& nodeToOutputNames) +{ + for (auto& edge : graphDesc.InputEdges) + { + nodeToInputNames[edge.ToNodeIndex][edge.ToNodeInputIndex] = graphInputIndexToNameMap.at(edge.GraphInputIndex); + } + + for (auto& edge : graphDesc.OutputEdges) + { + nodeToOutputNames[edge.FromNodeIndex][edge.FromNodeOutputIndex] = graphOutputIndexToNameMap.at(edge.GraphOutputIndex); + } + + std::unordered_map>> intermediateEdgeNames; + for (uint32_t edgeIndex = 0; edgeIndex < static_cast(graphDesc.IntermediateEdges.size()); edgeIndex++) + { + auto& edge = graphDesc.IntermediateEdges[edgeIndex]; + if (edge.Name.empty()) + { + throw std::invalid_argument( + "Graph intermediate edge from nodeIndex:" + std::to_string(edge.FromNodeIndex) + + " & nodeOutputIndex:" + std::to_string(edge.FromNodeOutputIndex) + " doesn't have a name."); + } + + if (intermediateEdgeNames.find(edge.FromNodeIndex) != intermediateEdgeNames.end() && + intermediateEdgeNames[edge.FromNodeIndex].find(edge.FromNodeOutputIndex) != intermediateEdgeNames[edge.FromNodeIndex].end()) + { + flatbuffers::Offset edgeNameOffset = intermediateEdgeNames[edge.FromNodeIndex][edge.FromNodeOutputIndex]; + flatbuffers::String* edgeName = ReadAs( + builder.GetCurrentBufferPointer(), + builder.GetSize() - edgeNameOffset.o); + + if (edgeName->str() != edge.Name) + { + throw std::invalid_argument( + "Graph intermediate edge from nodeIndex:" + std::to_string(edge.FromNodeIndex) + + " & nodeOutputIndex:" + std::to_string(edge.FromNodeOutputIndex) + " has more than one name."); + } + } + else + { + intermediateEdgeNames[edge.FromNodeIndex][edge.FromNodeOutputIndex] = builder.CreateString(edge.Name.c_str()); + } + nodeToInputNames[edge.ToNodeIndex][edge.ToNodeInputIndex] = intermediateEdgeNames[edge.FromNodeIndex][edge.FromNodeOutputIndex]; + nodeToOutputNames[edge.FromNodeIndex][edge.FromNodeOutputIndex] = intermediateEdgeNames[edge.FromNodeIndex][edge.FromNodeOutputIndex]; + } +} + + +/* +* - If an edge is connected to multiple nodes, then there will be multiple instances +* of input or intermediate edges, all with the same name. +* - The input will be validated incrementally throughout the execution +* of the method. +* - Handling of empty optional input/output/attribute for non-constant node: +* input/output +* - and will have a null entry +* but the actual OperatorNodeDesc variant's +* and will not have any entry. +* attribute +* - will have a null entry +*/ +flatbuffers::DetachedBuffer SerializeDmlGraph(const DmlSerializedGraphDesc& graphDesc) +{ + + flatbuffers::FlatBufferBuilder builder(1024); + if (graphDesc.Nodes.empty()) + { + return builder.Release(); + } + + // create input/output edge index to name map + std::unordered_map> graphInputIndexToNameMap = + ConvertToEdgeIndexToNameMap(graphDesc.InputEdges, builder); + std::unordered_map> graphOutputIndexToNameMap = + ConvertToEdgeIndexToNameMap(graphDesc.OutputEdges, builder); + + /* + * - Calculate number of input/output for each operator to allocate + * appropriate amount of memory for each node to store input/output names. + * - Non-constant node's input/output count can be determined by the + * AbstractOperatorDesc. + * - Constant node will only have outgoing edges and those outgoing edges + * will be intermediate edges. 
+ */ + std::vector nodeInputCounts(graphDesc.Nodes.size(), 0); + std::vector nodeOutputCounts(graphDesc.Nodes.size(), 0); + PopulateNonConstantNodeInputOutputCount(graphDesc.Nodes, nodeInputCounts, nodeOutputCounts); + PopulateConstantNodeInputOutputCount(graphDesc.IntermediateEdges, nodeInputCounts, nodeOutputCounts); + + // populate node input/output names. + std::vector>> nodeToInputNames(graphDesc.Nodes.size()); + std::vector>> nodeToOutputNames(graphDesc.Nodes.size()); + for (uint32_t nodeIndex = 0; nodeIndex < static_cast(graphDesc.Nodes.size()); nodeIndex++) + { + nodeToInputNames[nodeIndex].assign(nodeInputCounts[nodeIndex], builder.CreateString(nullptr, 0)); + nodeToOutputNames[nodeIndex].assign(nodeOutputCounts[nodeIndex], builder.CreateString(nullptr, 0)); + } + PopulateNodeInputOutputNames(builder, graphDesc, graphInputIndexToNameMap, graphOutputIndexToNameMap, nodeToInputNames, nodeToOutputNames); + + // Create flatbuffer node objects + std::vector> nodes(graphDesc.Nodes.size()); + for (uint32_t nodeIndex = 0; nodeIndex < static_cast(graphDesc.Nodes.size()); nodeIndex++) + { + nodes[nodeIndex] = SerializeNode( + builder, + nodeIndex, + graphDesc.Nodes[nodeIndex], + nodeToInputNames[nodeIndex], + nodeToOutputNames[nodeIndex]); + } + + // Convert to std::vector to create the object. + std::vector> graphInputNames(graphDesc.InputCount, builder.CreateString(nullptr, 0)); + std::vector> graphOutputNames(graphDesc.OutputCount, builder.CreateString(nullptr, 0)); + for (const auto& [key, value] : graphInputIndexToNameMap) + { + graphInputNames[key] = value; + } + for (const auto& [key, value] : graphOutputIndexToNameMap) + { + graphOutputNames[key] = value; + } + + flatbuffers::Offset dmlGraphDescOffset = dml::ir::CreateDmlGraphDescDirect( + builder, + &nodes, + &graphInputNames, + &graphOutputNames); + builder.Finish(dmlGraphDescOffset); + return builder.Release(); +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.cpp index 5c7b7bff1e370..0f0d445a95bae 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.cpp @@ -180,32 +180,50 @@ namespace Dml // Convert partitionONNXGraph into DML EP GraphDesc ComPtr device; ORT_THROW_IF_FAILED(providerImpl->GetDmlDevice(device.GetAddressOf())); + // This map will be used to transfer the initializer to D3D12 system heap memory. + // 'serializedDmlGraphDesc' will have constant input as intermediate edges, that's why + // we need a mapping between intermediateEdgeIndex and indexedSubGraph's (a given partition) + // input arg index. + // For ex: Let's say intermediate edge index = idx, then + // indexedSubGraphInputArgIdx = constantEdgeIdxToSubgraphInputArgIdxMap[idx]; + // corresponding constant tensor = initializerNameToInitializerMap[indexedSubGraph.GetMetaDef()->inputs[indexedSubGraphInputArgIdx]] + // We are using intermediate edge index as a key because same constant tensor can be used by + // multiple nodes. 
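As a minimal, self-contained sketch of the lookup chain described in the comment above — intermediate edge index, to subgraph input arg index, to the named initializer — using hypothetical stand-in names (`subgraphInputNames`, `initializerNameToData`, `constantEdgeIdxToSubgraphInputArgIdx`) rather than the actual members used in this file:

```
// Illustration only (not ONNX Runtime code): resolve a constant tensor from an
// intermediate edge index produced during DML graph serialization.
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main()
{
    // Hypothetical stand-ins for indexedSubGraph.GetMetaDef()->inputs and the initializer table.
    std::vector<std::string> subgraphInputNames = {"input_0", "weight_0", "bias_0"};
    std::unordered_map<std::string, std::string> initializerNameToData =
    {
        {"weight_0", "<raw weight bytes>"},
        {"bias_0", "<raw bias bytes>"},
    };

    // Keyed by intermediate edge index because the same constant tensor can feed multiple nodes.
    std::unordered_map<uint32_t, uint32_t> constantEdgeIdxToSubgraphInputArgIdx = {{5, 1}, {9, 2}};

    for (const auto& [edgeIdx, inputArgIdx] : constantEdgeIdxToSubgraphInputArgIdx)
    {
        const std::string& initializerName = subgraphInputNames[inputArgIdx];
        std::cout << "intermediate edge " << edgeIdx
                  << " -> subgraph input arg " << inputArgIdx
                  << " -> initializer '" << initializerName << "' ("
                  << initializerNameToData.at(initializerName) << ")\n";
    }
    return 0;
}
```

The indirection through the edge index (rather than the initializer name alone) is what lets a single constant tensor back several intermediate edges without duplicating the transfer to D3D12 heap memory.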
+ std::unordered_map serializedGraphInputIndexToSubgraphInputIndex; + std::unordered_map serializedGraphLargeConstantNameToSubgraphInputIndex; + std::vector> smallConstantData; GraphDescBuilder::GraphDesc graphDesc = GraphDescBuilder::BuildGraphDesc( isInputsUploadedByDmlEP.data(), isInputsUploadedByDmlEP.size(), m_isInitializerTransferable, m_partitionNodePropsMap, - device.Get(), providerImpl, m_modelPath, m_subgraphNodePointers, m_subgraphInputs, - m_subgraphOutputs); + m_subgraphOutputs, + serializedGraphInputIndexToSubgraphInputIndex, + serializedGraphLargeConstantNameToSubgraphInputIndex, + smallConstantData); m_outputShapes = graphDesc.outputShapes; // Walk through each graph edge and mark used inputs m_inputsUsed.resize(fusedNodeInputCount, false); - for (const DML_INPUT_GRAPH_EDGE_DESC& edge : graphDesc.inputEdges) - { - m_inputsUsed[edge.GraphInputIndex] = true; + for (auto it = serializedGraphInputIndexToSubgraphInputIndex.begin(); it != serializedGraphInputIndexToSubgraphInputIndex.end(); it++) { + m_inputsUsed[it->second] = true; + } + for (auto it = serializedGraphLargeConstantNameToSubgraphInputIndex.begin(); it != serializedGraphLargeConstantNameToSubgraphInputIndex.end(); it++) { + m_inputsUsed[it->second] = true; } // Compile the operator m_compiledExecutionPlanOperator = DmlGraphFusionHelper::TryCreateCompiledOperator( graphDesc, *m_indexedSubGraph, - providerImpl); + providerImpl, + &serializedGraphInputIndexToSubgraphInputIndex, + &serializedGraphLargeConstantNameToSubgraphInputIndex); // Queue references to objects which must be kept alive until resulting GPU work completes m_winmlProvider->QueueReference(m_compiledExecutionPlanOperator.Get()); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h index a5415ba85f3d3..e1e7eacfbd85d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h @@ -24,8 +24,8 @@ struct EnumTraits template <> struct EnumTraits { - static constexpr auto ValueCount = 161; - static constexpr size_t ActivationFunctionCount = 24; + static constexpr auto ValueCount = 168; + static constexpr size_t ActivationFunctionCount = 26; }; template <> @@ -62,7 +62,7 @@ struct EnumTraits template <> struct EnumTraits { - static constexpr auto ValueCount = 4; + static constexpr auto ValueCount = 5; }; template <> @@ -86,7 +86,7 @@ struct EnumTraits template <> struct EnumTraits { - static constexpr auto ValueCount = 8; + static constexpr auto ValueCount = 13; }; template <> @@ -119,6 +119,12 @@ struct EnumTraits static constexpr auto ValueCount = 1; }; +template <> +struct EnumTraits +{ + static constexpr auto ValueCount = 5; +}; + template constexpr auto EnumValueCount = EnumTraits::ValueCount; @@ -495,12 +501,6 @@ struct OperatorDescTraits static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_ROI_POOLING; }; -template <> -struct OperatorDescTraits -{ - static constexpr DML_OPERATOR_TYPE Type = (DML_OPERATOR_TYPE) DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING; -}; - template <> struct OperatorDescTraits { @@ -1029,6 +1029,24 @@ struct OperatorDescTraits static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_DIAGONAL_MATRIX1; }; +template <> +struct OperatorDescTraits +{ + static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_MULTIHEAD_ATTENTION; +}; + +template <> 
+struct OperatorDescTraits +{ + static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING; +}; + +template <> +struct OperatorDescTraits +{ + static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT; +}; + template <> struct OperatorDescTraits { @@ -1174,9 +1192,15 @@ struct OperatorDescTraits }; template <> -struct OperatorDescTraits +struct OperatorDescTraits { - static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_MULTIHEAD_ATTENTION; + static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_ACTIVATION_SWISH; +}; + +template <> +struct OperatorDescTraits +{ + static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_ACTIVATION_HARD_SWISH; }; template @@ -1502,12 +1526,6 @@ struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_ROI_POOLING> using DescType = DML_ROI_POOLING_OPERATOR_DESC; }; -template <> -struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING> -{ - using DescType = DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC; -}; - template <> struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_SLICE> { @@ -2036,6 +2054,24 @@ struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_DIAGONAL_MATRIX1> using DescType = DML_DIAGONAL_MATRIX1_OPERATOR_DESC; }; +template <> +struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_MULTIHEAD_ATTENTION> +{ + using DescType = DML_MULTIHEAD_ATTENTION_OPERATOR_DESC; +}; + +template <> +struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING> +{ + using DescType = DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC; +}; + +template <> +struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT> +{ + using DescType = DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC; +}; + template <> struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_ACTIVATION_ELU> { @@ -2181,14 +2217,20 @@ struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_ACTIVATION_GELU> }; template <> -struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_MULTIHEAD_ATTENTION> +struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_ACTIVATION_SWISH> { - using DescType = DML_MULTIHEAD_ATTENTION_OPERATOR_DESC; + using DescType = DML_ACTIVATION_SWISH_OPERATOR_DESC; +}; + +template <> +struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_ACTIVATION_HARD_SWISH> +{ + using DescType = DML_ACTIVATION_HARD_SWISH_OPERATOR_DESC; }; // Calls a visitor functor, supplying an empty operator desc corresponding to the given DML_OPERATOR_TYPE as // the first argument. -// +// // For example: // Visit(DML_OPERATOR_ELEMENT_WISE_IDENTITY, [](auto tag) { // using T = decltype(tag); // T is one of the DML_*_OPERATOR_DESC structs @@ -2485,6 +2527,10 @@ auto OperatorTypeVisitor(DML_OPERATOR_TYPE type, Visitor&& visitor, Ts&&... 
args return std::invoke(std::forward(visitor), DML_DIAGONAL_MATRIX1_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_MULTIHEAD_ATTENTION: return std::invoke(std::forward(visitor), DML_MULTIHEAD_ATTENTION_OPERATOR_DESC{}, std::forward(args)...); + case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: + return std::invoke(std::forward(visitor), DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC{}, std::forward(args)...); + case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: + return std::invoke(std::forward(visitor), DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_ACTIVATION_ELU: return std::invoke(std::forward(visitor), DML_ACTIVATION_ELU_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_ACTIVATION_CELU: @@ -2533,13 +2579,10 @@ auto OperatorTypeVisitor(DML_OPERATOR_TYPE type, Visitor&& visitor, Ts&&... args return std::invoke(std::forward(visitor), DML_ACTIVATION_SHRINK_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_ACTIVATION_GELU: return std::invoke(std::forward(visitor), DML_ACTIVATION_GELU_OPERATOR_DESC{}, std::forward(args)...); - -#pragma warning(push) -#pragma warning(disable: 4063) - case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: - return std::invoke(std::forward(visitor), DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC{}, std::forward(args)...); -#pragma warning(pop) - + case DML_OPERATOR_ACTIVATION_SWISH: + return std::invoke(std::forward(visitor), DML_ACTIVATION_SWISH_OPERATOR_DESC{}, std::forward(args)...); + case DML_OPERATOR_ACTIVATION_HARD_SWISH: + return std::invoke(std::forward(visitor), DML_ACTIVATION_HARD_SWISH_OPERATOR_DESC{}, std::forward(args)...); default: ORT_THROW_HR(E_INVALIDARG); return std::invoke(std::forward(visitor), DML_ACTIVATION_RELU_OPERATOR_DESC{}, std::forward(args)...); @@ -2547,7 +2590,55 @@ auto OperatorTypeVisitor(DML_OPERATOR_TYPE type, Visitor&& visitor, Ts&&... args } #pragma warning(pop) +namespace StringifyHelpers +{ +template +inline gsl::czstring ToString(T value) +{ +#ifndef WAI_BUILD_LINUX + // Clang will instantiate this template even if it isn't used, + // so this static_assert will always fire and break the build. 
+ static_assert(false, "Not implemented for this type"); +#endif +} + +template <> +inline gsl::czstring ToString(DML_TENSOR_DATA_TYPE value) +{ + switch (value) + { + case DML_TENSOR_DATA_TYPE_UNKNOWN: return "DML_TENSOR_DATA_TYPE_UNKNOWN"; + case DML_TENSOR_DATA_TYPE_FLOAT32: return "DML_TENSOR_DATA_TYPE_FLOAT32"; + case DML_TENSOR_DATA_TYPE_FLOAT16: return "DML_TENSOR_DATA_TYPE_FLOAT16"; + case DML_TENSOR_DATA_TYPE_UINT32: return "DML_TENSOR_DATA_TYPE_UINT32"; + case DML_TENSOR_DATA_TYPE_UINT16: return "DML_TENSOR_DATA_TYPE_UINT16"; + case DML_TENSOR_DATA_TYPE_UINT8: return "DML_TENSOR_DATA_TYPE_UINT8"; + case DML_TENSOR_DATA_TYPE_INT32: return "DML_TENSOR_DATA_TYPE_INT32"; + case DML_TENSOR_DATA_TYPE_INT16: return "DML_TENSOR_DATA_TYPE_INT16"; + case DML_TENSOR_DATA_TYPE_INT8: return "DML_TENSOR_DATA_TYPE_INT8"; + case DML_TENSOR_DATA_TYPE_FLOAT64: return "DML_TENSOR_DATA_TYPE_FLOAT64"; + case DML_TENSOR_DATA_TYPE_UINT64: return "DML_TENSOR_DATA_TYPE_UINT64"; + case DML_TENSOR_DATA_TYPE_INT64: return "DML_TENSOR_DATA_TYPE_INT64"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_TENSOR_TYPE value) +{ + switch (value) + { + case DML_TENSOR_TYPE_INVALID: return "DML_TENSOR_TYPE_INVALID"; + case DML_TENSOR_TYPE_BUFFER: return "DML_TENSOR_TYPE_BUFFER"; + default: + assert(false); + return ""; + } +} +template <> inline gsl::czstring ToString(DML_OPERATOR_TYPE value) { switch (value) @@ -2561,9 +2652,6 @@ inline gsl::czstring ToString(DML_OPERATOR_TYPE value) case DML_OPERATOR_ELEMENT_WISE_ATAN: return "DML_OPERATOR_ELEMENT_WISE_ATAN"; case DML_OPERATOR_ELEMENT_WISE_CEIL: return "DML_OPERATOR_ELEMENT_WISE_CEIL"; case DML_OPERATOR_ELEMENT_WISE_CLIP: return "DML_OPERATOR_ELEMENT_WISE_CLIP"; - case DML_OPERATOR_ELEMENT_WISE_CLIP1: return "DML_OPERATOR_ELEMENT_WISE_CLIP1"; - case DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD: return "DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD"; - case DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD1: return "DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD1"; case DML_OPERATOR_ELEMENT_WISE_COS: return "DML_OPERATOR_ELEMENT_WISE_COS"; case DML_OPERATOR_ELEMENT_WISE_DIVIDE: return "DML_OPERATOR_ELEMENT_WISE_DIVIDE"; case DML_OPERATOR_ELEMENT_WISE_EXP: return "DML_OPERATOR_ELEMENT_WISE_EXP"; @@ -2587,24 +2675,41 @@ inline gsl::czstring ToString(DML_OPERATOR_TYPE value) case DML_OPERATOR_ELEMENT_WISE_RECIP: return "DML_OPERATOR_ELEMENT_WISE_RECIP"; case DML_OPERATOR_ELEMENT_WISE_SIN: return "DML_OPERATOR_ELEMENT_WISE_SIN"; case DML_OPERATOR_ELEMENT_WISE_SQRT: return "DML_OPERATOR_ELEMENT_WISE_SQRT"; - case DML_OPERATOR_ELEMENT_WISE_DIFFERENCE_SQUARE: return "DML_OPERATOR_ELEMENT_WISE_DIFFERENCE_SQUARE"; - case DML_OPERATOR_ELEMENT_WISE_ATAN_YX: return "DML_OPERATOR_ELEMENT_WISE_ATAN_YX"; case DML_OPERATOR_ELEMENT_WISE_SUBTRACT: return "DML_OPERATOR_ELEMENT_WISE_SUBTRACT"; case DML_OPERATOR_ELEMENT_WISE_TAN: return "DML_OPERATOR_ELEMENT_WISE_TAN"; case DML_OPERATOR_ELEMENT_WISE_THRESHOLD: return "DML_OPERATOR_ELEMENT_WISE_THRESHOLD"; case DML_OPERATOR_ELEMENT_WISE_QUANTIZE_LINEAR: return "DML_OPERATOR_ELEMENT_WISE_QUANTIZE_LINEAR"; case DML_OPERATOR_ELEMENT_WISE_DEQUANTIZE_LINEAR: return "DML_OPERATOR_ELEMENT_WISE_DEQUANTIZE_LINEAR"; + case DML_OPERATOR_ACTIVATION_ELU: return "DML_OPERATOR_ACTIVATION_ELU"; + case DML_OPERATOR_ACTIVATION_CELU: return "DML_OPERATOR_ACTIVATION_CELU"; + case DML_OPERATOR_ACTIVATION_HARDMAX: return "DML_OPERATOR_ACTIVATION_HARDMAX"; + case DML_OPERATOR_ACTIVATION_HARDMAX1: return "DML_OPERATOR_ACTIVATION_HARDMAX1"; + case 
DML_OPERATOR_ACTIVATION_HARD_SIGMOID: return "DML_OPERATOR_ACTIVATION_HARD_SIGMOID"; + case DML_OPERATOR_ACTIVATION_IDENTITY: return "DML_OPERATOR_ACTIVATION_IDENTITY"; + case DML_OPERATOR_ACTIVATION_LEAKY_RELU: return "DML_OPERATOR_ACTIVATION_LEAKY_RELU"; + case DML_OPERATOR_ACTIVATION_LINEAR: return "DML_OPERATOR_ACTIVATION_LINEAR"; + case DML_OPERATOR_ACTIVATION_LOG_SOFTMAX: return "DML_OPERATOR_ACTIVATION_LOG_SOFTMAX"; + case DML_OPERATOR_ACTIVATION_LOG_SOFTMAX1: return "DML_OPERATOR_ACTIVATION_LOG_SOFTMAX1"; + case DML_OPERATOR_ACTIVATION_PARAMETERIZED_RELU: return "DML_OPERATOR_ACTIVATION_PARAMETERIZED_RELU"; + case DML_OPERATOR_ACTIVATION_PARAMETRIC_SOFTPLUS: return "DML_OPERATOR_ACTIVATION_PARAMETRIC_SOFTPLUS"; + case DML_OPERATOR_ACTIVATION_RELU: return "DML_OPERATOR_ACTIVATION_RELU"; + case DML_OPERATOR_ACTIVATION_SCALED_ELU: return "DML_OPERATOR_ACTIVATION_SCALED_ELU"; + case DML_OPERATOR_ACTIVATION_SCALED_TANH: return "DML_OPERATOR_ACTIVATION_SCALED_TANH"; + case DML_OPERATOR_ACTIVATION_SIGMOID: return "DML_OPERATOR_ACTIVATION_SIGMOID"; + case DML_OPERATOR_ACTIVATION_SOFTMAX: return "DML_OPERATOR_ACTIVATION_SOFTMAX"; + case DML_OPERATOR_ACTIVATION_SOFTMAX1: return "DML_OPERATOR_ACTIVATION_SOFTMAX1"; + case DML_OPERATOR_ACTIVATION_SOFTPLUS: return "DML_OPERATOR_ACTIVATION_SOFTPLUS"; + case DML_OPERATOR_ACTIVATION_SOFTSIGN: return "DML_OPERATOR_ACTIVATION_SOFTSIGN"; + case DML_OPERATOR_ACTIVATION_TANH: return "DML_OPERATOR_ACTIVATION_TANH"; + case DML_OPERATOR_ACTIVATION_THRESHOLDED_RELU: return "DML_OPERATOR_ACTIVATION_THRESHOLDED_RELU"; case DML_OPERATOR_CONVOLUTION: return "DML_OPERATOR_CONVOLUTION"; case DML_OPERATOR_GEMM: return "DML_OPERATOR_GEMM"; case DML_OPERATOR_REDUCE: return "DML_OPERATOR_REDUCE"; - case DML_OPERATOR_ARGMIN: return "DML_OPERATOR_ARGMIN"; - case DML_OPERATOR_ARGMAX: return "DML_OPERATOR_ARGMAX"; case DML_OPERATOR_AVERAGE_POOLING: return "DML_OPERATOR_AVERAGE_POOLING"; case DML_OPERATOR_AVERAGE_POOLING1: return "DML_OPERATOR_AVERAGE_POOLING1"; case DML_OPERATOR_LP_POOLING: return "DML_OPERATOR_LP_POOLING"; case DML_OPERATOR_LP_POOLING1: return "DML_OPERATOR_LP_POOLING1"; case DML_OPERATOR_MAX_POOLING: return "DML_OPERATOR_MAX_POOLING"; - case DML_OPERATOR_MAX_POOLING1: return "DML_OPERATOR_MAX_POOLING1"; case DML_OPERATOR_ROI_POOLING: return "DML_OPERATOR_ROI_POOLING"; case DML_OPERATOR_SLICE: return "DML_OPERATOR_SLICE"; case DML_OPERATOR_CAST: return "DML_OPERATOR_CAST"; @@ -2620,18 +2725,15 @@ inline gsl::czstring ToString(DML_OPERATOR_TYPE value) case DML_OPERATOR_TILE: return "DML_OPERATOR_TILE"; case DML_OPERATOR_TOP_K: return "DML_OPERATOR_TOP_K"; case DML_OPERATOR_BATCH_NORMALIZATION: return "DML_OPERATOR_BATCH_NORMALIZATION"; - case DML_OPERATOR_BATCH_NORMALIZATION_GRAD: return "DML_OPERATOR_BATCH_NORMALIZATION_GRAD"; - case DML_OPERATOR_BATCH_NORMALIZATION_TRAINING_GRAD: return "DML_OPERATOR_BATCH_NORMALIZATION_TRAINING_GRAD"; + case DML_OPERATOR_BATCH_NORMALIZATION_TRAINING: return "DML_OPERATOR_BATCH_NORMALIZATION_TRAINING"; case DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION: return "DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION"; case DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION: return "DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION"; - case DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION_GRAD: return "DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION_GRAD"; case DML_OPERATOR_LP_NORMALIZATION: return "DML_OPERATOR_LP_NORMALIZATION"; case DML_OPERATOR_RNN: return "DML_OPERATOR_RNN"; case DML_OPERATOR_LSTM: return "DML_OPERATOR_LSTM"; case DML_OPERATOR_GRU: return 
"DML_OPERATOR_GRU"; case DML_OPERATOR_ELEMENT_WISE_SIGN: return "DML_OPERATOR_ELEMENT_WISE_SIGN"; case DML_OPERATOR_ELEMENT_WISE_IS_NAN: return "DML_OPERATOR_ELEMENT_WISE_IS_NAN"; - case DML_OPERATOR_ELEMENT_WISE_NEGATE: return "DML_OPERATOR_ELEMENT_WISE_NEGATE"; case DML_OPERATOR_ELEMENT_WISE_ERF: return "DML_OPERATOR_ELEMENT_WISE_ERF"; case DML_OPERATOR_ELEMENT_WISE_SINH: return "DML_OPERATOR_ELEMENT_WISE_SINH"; case DML_OPERATOR_ELEMENT_WISE_COSH: return "DML_OPERATOR_ELEMENT_WISE_COSH"; @@ -2641,6 +2743,8 @@ inline gsl::czstring ToString(DML_OPERATOR_TYPE value) case DML_OPERATOR_ELEMENT_WISE_ATANH: return "DML_OPERATOR_ELEMENT_WISE_ATANH"; case DML_OPERATOR_ELEMENT_WISE_IF: return "DML_OPERATOR_ELEMENT_WISE_IF"; case DML_OPERATOR_ELEMENT_WISE_ADD1: return "DML_OPERATOR_ELEMENT_WISE_ADD1"; + case DML_OPERATOR_ACTIVATION_SHRINK: return "DML_OPERATOR_ACTIVATION_SHRINK"; + case DML_OPERATOR_MAX_POOLING1: return "DML_OPERATOR_MAX_POOLING1"; case DML_OPERATOR_MAX_UNPOOLING: return "DML_OPERATOR_MAX_UNPOOLING"; case DML_OPERATOR_DIAGONAL_MATRIX: return "DML_OPERATOR_DIAGONAL_MATRIX"; case DML_OPERATOR_SCATTER: return "DML_OPERATOR_SCATTER"; @@ -2652,10 +2756,9 @@ inline gsl::czstring ToString(DML_OPERATOR_TYPE value) case DML_OPERATOR_ELEMENT_WISE_IS_INFINITY: return "DML_OPERATOR_ELEMENT_WISE_IS_INFINITY"; case DML_OPERATOR_ELEMENT_WISE_MODULUS_TRUNCATE: return "DML_OPERATOR_ELEMENT_WISE_MODULUS_TRUNCATE"; case DML_OPERATOR_ELEMENT_WISE_MODULUS_FLOOR: return "DML_OPERATOR_ELEMENT_WISE_MODULUS_FLOOR"; - case DML_OPERATOR_FILL_VALUE_CONSTANT: return "DML_OPERATOR_FILL_VALUE_CONSTANT"; case DML_OPERATOR_FILL_VALUE_SEQUENCE: return "DML_OPERATOR_FILL_VALUE_SEQUENCE"; + case DML_OPERATOR_FILL_VALUE_CONSTANT: return "DML_OPERATOR_FILL_VALUE_CONSTANT"; case DML_OPERATOR_CUMULATIVE_SUMMATION: return "DML_OPERATOR_CUMULATIVE_SUMMATION"; - case DML_OPERATOR_CUMULATIVE_PRODUCT: return "DML_OPERATOR_CUMULATIVE_PRODUCT"; case DML_OPERATOR_REVERSE_SUBSEQUENCES: return "DML_OPERATOR_REVERSE_SUBSEQUENCES"; case DML_OPERATOR_GATHER_ELEMENTS: return "DML_OPERATOR_GATHER_ELEMENTS"; case DML_OPERATOR_GATHER_ND: return "DML_OPERATOR_GATHER_ND"; @@ -2684,20 +2787,278 @@ inline gsl::czstring ToString(DML_OPERATOR_TYPE value) case DML_OPERATOR_RESAMPLE_GRAD: return "DML_OPERATOR_RESAMPLE_GRAD"; case DML_OPERATOR_SLICE_GRAD: return "DML_OPERATOR_SLICE_GRAD"; case DML_OPERATOR_ADAM_OPTIMIZER: return "DML_OPERATOR_ADAM_OPTIMIZER"; + case DML_OPERATOR_ARGMIN: return "DML_OPERATOR_ARGMIN"; + case DML_OPERATOR_ARGMAX: return "DML_OPERATOR_ARGMAX"; case DML_OPERATOR_ROI_ALIGN: return "DML_OPERATOR_ROI_ALIGN"; - case DML_OPERATOR_ROI_ALIGN1: return "DML_OPERATOR_ROI_ALIGN1"; case DML_OPERATOR_GATHER_ND1: return "DML_OPERATOR_GATHER_ND1"; - case DML_OPERATOR_DYNAMIC_QUANTIZE_LINEAR: return "DML_OPERATOR_DYNAMIC_QUANTIZE_LINEAR"; + case DML_OPERATOR_ELEMENT_WISE_ATAN_YX: return "DML_OPERATOR_ELEMENT_WISE_ATAN_YX"; + case DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD: return "DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD"; + case DML_OPERATOR_ELEMENT_WISE_DIFFERENCE_SQUARE: return "DML_OPERATOR_ELEMENT_WISE_DIFFERENCE_SQUARE"; + case DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION_GRAD: return "DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION_GRAD"; + case DML_OPERATOR_CUMULATIVE_PRODUCT: return "DML_OPERATOR_CUMULATIVE_PRODUCT"; + case DML_OPERATOR_BATCH_NORMALIZATION_GRAD: return "DML_OPERATOR_BATCH_NORMALIZATION_GRAD"; + case DML_OPERATOR_BATCH_NORMALIZATION_TRAINING_GRAD: return "DML_OPERATOR_BATCH_NORMALIZATION_TRAINING_GRAD"; case 
DML_OPERATOR_ELEMENT_WISE_QUANTIZED_LINEAR_ADD: return "DML_OPERATOR_ELEMENT_WISE_QUANTIZED_LINEAR_ADD"; - case DML_OPERATOR_ROI_ALIGN_GRAD: return "DML_OPERATOR_ROI_ALIGN_GRAD"; - case DML_OPERATOR_BATCH_NORMALIZATION_TRAINING: return "DML_OPERATOR_BATCH_NORMALIZATION_TRAINING"; + case DML_OPERATOR_DYNAMIC_QUANTIZE_LINEAR: return "DML_OPERATOR_DYNAMIC_QUANTIZE_LINEAR"; + case DML_OPERATOR_ROI_ALIGN1: return "DML_OPERATOR_ROI_ALIGN1"; + case DML_OPERATOR_ELEMENT_WISE_CLIP1: return "DML_OPERATOR_ELEMENT_WISE_CLIP1"; + case DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD1: return "DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD1"; + case DML_OPERATOR_ELEMENT_WISE_NEGATE: return "DML_OPERATOR_ELEMENT_WISE_NEGATE"; + case DML_OPERATOR_ACTIVATION_GELU: return "DML_OPERATOR_ACTIVATION_GELU"; + case DML_OPERATOR_ACTIVATION_SWISH: return "DML_OPERATOR_ACTIVATION_SWISH"; + case DML_OPERATOR_ACTIVATION_HARD_SWISH: return "DML_OPERATOR_ACTIVATION_HARD_SWISH"; case DML_OPERATOR_RESAMPLE2: return "DML_OPERATOR_RESAMPLE2"; case DML_OPERATOR_RESAMPLE_GRAD1: return "DML_OPERATOR_RESAMPLE_GRAD1"; case DML_OPERATOR_DIAGONAL_MATRIX1: return "DML_OPERATOR_DIAGONAL_MATRIX1"; case DML_OPERATOR_MULTIHEAD_ATTENTION: return "DML_OPERATOR_MULTIHEAD_ATTENTION"; + case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: return "DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING"; + case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: return "DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_BINDING_TYPE value) +{ + switch (value) + { + case DML_BINDING_TYPE_NONE: return "DML_BINDING_TYPE_NONE"; + case DML_BINDING_TYPE_BUFFER: return "DML_BINDING_TYPE_BUFFER"; + case DML_BINDING_TYPE_BUFFER_ARRAY: return "DML_BINDING_TYPE_BUFFER_ARRAY"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_REDUCE_FUNCTION value) +{ + switch (value) + { + case DML_REDUCE_FUNCTION_ARGMAX: return "DML_REDUCE_FUNCTION_ARGMAX"; + case DML_REDUCE_FUNCTION_ARGMIN: return "DML_REDUCE_FUNCTION_ARGMIN"; + case DML_REDUCE_FUNCTION_AVERAGE: return "DML_REDUCE_FUNCTION_AVERAGE"; + case DML_REDUCE_FUNCTION_L1: return "DML_REDUCE_FUNCTION_L1"; + case DML_REDUCE_FUNCTION_L2: return "DML_REDUCE_FUNCTION_L2"; + case DML_REDUCE_FUNCTION_LOG_SUM: return "DML_REDUCE_FUNCTION_LOG_SUM"; + case DML_REDUCE_FUNCTION_LOG_SUM_EXP: return "DML_REDUCE_FUNCTION_LOG_SUM_EXP"; + case DML_REDUCE_FUNCTION_MAX: return "DML_REDUCE_FUNCTION_MAX"; + case DML_REDUCE_FUNCTION_MIN: return "DML_REDUCE_FUNCTION_MIN"; + case DML_REDUCE_FUNCTION_MULTIPLY: return "DML_REDUCE_FUNCTION_MULTIPLY"; + case DML_REDUCE_FUNCTION_SUM: return "DML_REDUCE_FUNCTION_SUM"; + case DML_REDUCE_FUNCTION_SUM_SQUARE: return "DML_REDUCE_FUNCTION_SUM_SQUARE"; default: assert(false); return ""; } } + +template <> +inline gsl::czstring ToString(DML_MATRIX_TRANSFORM value) +{ + switch (value) + { + case DML_MATRIX_TRANSFORM_NONE: return "DML_MATRIX_TRANSFORM_NONE"; + case DML_MATRIX_TRANSFORM_TRANSPOSE: return "DML_MATRIX_TRANSFORM_TRANSPOSE"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_CONVOLUTION_MODE value) +{ + switch (value) + { + case DML_CONVOLUTION_MODE_CONVOLUTION: return "DML_CONVOLUTION_MODE_CONVOLUTION"; + case DML_CONVOLUTION_MODE_CROSS_CORRELATION: return "DML_CONVOLUTION_MODE_CROSS_CORRELATION"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_CONVOLUTION_DIRECTION value) +{ + switch 
(value) + { + case DML_CONVOLUTION_DIRECTION_FORWARD: return "DML_CONVOLUTION_DIRECTION_FORWARD"; + case DML_CONVOLUTION_DIRECTION_BACKWARD: return "DML_CONVOLUTION_DIRECTION_BACKWARD"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_PADDING_MODE value) +{ + switch (value) + { + case DML_PADDING_MODE_CONSTANT: return "DML_PADDING_MODE_CONSTANT"; + case DML_PADDING_MODE_EDGE: return "DML_PADDING_MODE_EDGE"; + case DML_PADDING_MODE_REFLECTION: return "DML_PADDING_MODE_REFLECTION"; + case DML_PADDING_MODE_SYMMETRIC: return "DML_PADDING_MODE_SYMMETRIC"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_INTERPOLATION_MODE value) +{ + switch (value) + { + case DML_INTERPOLATION_MODE_NEAREST_NEIGHBOR: return "DML_INTERPOLATION_MODE_NEAREST_NEIGHBOR"; + case DML_INTERPOLATION_MODE_LINEAR: return "DML_INTERPOLATION_MODE_LINEAR"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_RECURRENT_NETWORK_DIRECTION value) +{ + switch (value) + { + case DML_RECURRENT_NETWORK_DIRECTION_FORWARD: return "DML_RECURRENT_NETWORK_DIRECTION_FORWARD"; + case DML_RECURRENT_NETWORK_DIRECTION_BACKWARD: return "DML_RECURRENT_NETWORK_DIRECTION_BACKWARD"; + case DML_RECURRENT_NETWORK_DIRECTION_BIDIRECTIONAL: return "DML_RECURRENT_NETWORK_DIRECTION_BIDIRECTIONAL"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_FEATURE value) +{ + switch (value) + { + case DML_FEATURE_TENSOR_DATA_TYPE_SUPPORT: return "DML_FEATURE_TENSOR_DATA_TYPE_SUPPORT"; + case DML_FEATURE_FEATURE_LEVELS: return "DML_FEATURE_FEATURE_LEVELS"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_FEATURE_LEVEL value) +{ + switch (value) + { + case DML_FEATURE_LEVEL_1_0: return "DML_FEATURE_LEVEL_1_0"; + case DML_FEATURE_LEVEL_2_0: return "DML_FEATURE_LEVEL_2_0"; + case DML_FEATURE_LEVEL_2_1: return "DML_FEATURE_LEVEL_2_1"; + case DML_FEATURE_LEVEL_3_0: return "DML_FEATURE_LEVEL_3_0"; + case DML_FEATURE_LEVEL_3_1: return "DML_FEATURE_LEVEL_3_1"; + case DML_FEATURE_LEVEL_4_0: return "DML_FEATURE_LEVEL_4_0"; + case DML_FEATURE_LEVEL_4_1: return "DML_FEATURE_LEVEL_4_1"; + case DML_FEATURE_LEVEL_5_0: return "DML_FEATURE_LEVEL_5_0"; + case DML_FEATURE_LEVEL_5_1: return "DML_FEATURE_LEVEL_5_1"; + case DML_FEATURE_LEVEL_5_2: return "DML_FEATURE_LEVEL_5_2"; + case DML_FEATURE_LEVEL_6_0: return "DML_FEATURE_LEVEL_6_0"; + case DML_FEATURE_LEVEL_6_1: return "DML_FEATURE_LEVEL_6_1"; + case DML_FEATURE_LEVEL_6_2: return "DML_FEATURE_LEVEL_6_2"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_IS_INFINITY_MODE value) +{ + switch (value) + { + case DML_IS_INFINITY_MODE_EITHER: return "DML_IS_INFINITY_MODE_EITHER"; + case DML_IS_INFINITY_MODE_POSITIVE: return "DML_IS_INFINITY_MODE_POSITIVE"; + case DML_IS_INFINITY_MODE_NEGATIVE: return "DML_IS_INFINITY_MODE_NEGATIVE"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_DEPTH_SPACE_ORDER value) +{ + switch (value) + { + case DML_DEPTH_SPACE_ORDER_DEPTH_COLUMN_ROW: return "DML_DEPTH_SPACE_ORDER_DEPTH_COLUMN_ROW"; + case DML_DEPTH_SPACE_ORDER_COLUMN_ROW_DEPTH: return "DML_DEPTH_SPACE_ORDER_COLUMN_ROW_DEPTH"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_AXIS_DIRECTION value) +{ + switch (value) + { + case DML_AXIS_DIRECTION_INCREASING: return 
"DML_AXIS_DIRECTION_INCREASING"; + case DML_AXIS_DIRECTION_DECREASING: return "DML_AXIS_DIRECTION_DECREASING"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_ROUNDING_MODE value) +{ + switch (value) + { + case DML_ROUNDING_MODE_HALVES_TO_NEAREST_EVEN: return "DML_ROUNDING_MODE_HALVES_TO_NEAREST_EVEN"; + case DML_ROUNDING_MODE_TOWARD_ZERO: return "DML_ROUNDING_MODE_TOWARD_ZERO"; + case DML_ROUNDING_MODE_TOWARD_INFINITY: return "DML_ROUNDING_MODE_TOWARD_INFINITY"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_RANDOM_GENERATOR_TYPE value) +{ + switch (value) + { + case DML_RANDOM_GENERATOR_TYPE_PHILOX_4X32_10: return "DML_RANDOM_GENERATOR_TYPE_PHILOX_4X32_10"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_MULTIHEAD_ATTENTION_MASK_TYPE value) +{ + switch (value) + { + case DML_MULTIHEAD_ATTENTION_MASK_TYPE_NONE: return "DML_MULTIHEAD_ATTENTION_MASK_TYPE_NONE"; + case DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_LENGTH: return "DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_LENGTH"; + case DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_END_START: return "DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_END_START"; + case DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_QUERY_SEQUENCE_LENGTH_START_END: return "DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_QUERY_SEQUENCE_LENGTH_START_END"; + case DML_MULTIHEAD_ATTENTION_MASK_TYPE_BOOLEAN: return "DML_MULTIHEAD_ATTENTION_MASK_TYPE_BOOLEAN"; + default: + assert(false); + return ""; + } +} + + +template +T FromString(std::string_view value); + +} } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h index 2a82c12872a72..5fe6603c2a0bf 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h @@ -618,7 +618,7 @@ constexpr DML_OPERATOR_SCHEMA DML_ELEMENT_WISE_THRESHOLD_OPERATOR_SCHEMA { constexpr DML_SCHEMA_FIELD DML_ELEMENT_WISE_QUANTIZE_LINEAR_OPERATOR_SCHEMA_FIELDS[4] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ScaleTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ZeroPointTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ZeroPointTensor", true }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, }; @@ -633,7 +633,7 @@ constexpr DML_OPERATOR_SCHEMA DML_ELEMENT_WISE_QUANTIZE_LINEAR_OPERATOR_SCHEMA { constexpr DML_SCHEMA_FIELD DML_ELEMENT_WISE_DEQUANTIZE_LINEAR_OPERATOR_SCHEMA_FIELDS[4] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ScaleTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ZeroPointTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ZeroPointTensor", true }, DML_SCHEMA_FIELD 
{ DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, }; @@ -869,31 +869,6 @@ constexpr DML_OPERATOR_SCHEMA DML_ROI_POOLING_OPERATOR_SCHEMA { DML_ROI_POOLING_OPERATOR_SCHEMA_FIELDS, }; - -constexpr DML_SCHEMA_FIELD DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA_FIELDS[13] { - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputScaleTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputZeroPointTensor", true }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputScaleTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputZeroPointTensor", true }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "DimensionCount", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Strides", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "WindowSize", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "StartPadding", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "EndPadding", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Dilations", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "IncludePadding", false }, -}; - -constexpr DML_OPERATOR_SCHEMA DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA { - "DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING", - static_cast(DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING), - DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, - 13, - DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA_FIELDS, -}; - constexpr DML_SCHEMA_FIELD DML_SLICE_OPERATOR_SCHEMA_FIELDS[6] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, @@ -1146,7 +1121,7 @@ constexpr DML_SCHEMA_FIELD DML_BATCH_NORMALIZATION_TRAINING_GRAD_OPERATOR_SCHEMA DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputGradientTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputScaleGradientTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputBiasGradientTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT, "Epsilon", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT, "Epsilon", false }, }; constexpr DML_OPERATOR_SCHEMA DML_BATCH_NORMALIZATION_TRAINING_GRAD_OPERATOR_SCHEMA { @@ -2312,7 +2287,7 @@ constexpr DML_OPERATOR_SCHEMA DML_BATCH_NORMALIZATION_TRAINING_OPERATOR_SCHEMA { DML_BATCH_NORMALIZATION_TRAINING_OPERATOR_SCHEMA_FIELDS, }; -constexpr DML_SCHEMA_FIELD DML_RESAMPLE2_OPERATOR_SCHEMA_FIELDS[8]{ +constexpr DML_SCHEMA_FIELD DML_RESAMPLE2_OPERATOR_SCHEMA_FIELDS[8] { DML_SCHEMA_FIELD { 
DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "InterpolationMode", false }, @@ -2323,7 +2298,7 @@ constexpr DML_SCHEMA_FIELD DML_RESAMPLE2_OPERATOR_SCHEMA_FIELDS[8]{ DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT_ARRAY, "OutputPixelOffsets", false }, }; -constexpr DML_OPERATOR_SCHEMA DML_RESAMPLE2_OPERATOR_SCHEMA{ +constexpr DML_OPERATOR_SCHEMA DML_RESAMPLE2_OPERATOR_SCHEMA { "DML_OPERATOR_RESAMPLE2", DML_OPERATOR_RESAMPLE2, DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, @@ -2342,7 +2317,7 @@ constexpr DML_SCHEMA_FIELD DML_RESAMPLE_GRAD1_OPERATOR_SCHEMA_FIELDS[8]{ DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT_ARRAY, "OutputPixelOffsets", false }, }; -constexpr DML_OPERATOR_SCHEMA DML_RESAMPLE_GRAD1_OPERATOR_SCHEMA{ +constexpr DML_OPERATOR_SCHEMA DML_RESAMPLE_GRAD1_OPERATOR_SCHEMA { "DML_OPERATOR_RESAMPLE_GRAD1", DML_OPERATOR_RESAMPLE_GRAD1, DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, @@ -2350,7 +2325,7 @@ constexpr DML_OPERATOR_SCHEMA DML_RESAMPLE_GRAD1_OPERATOR_SCHEMA{ DML_RESAMPLE_GRAD1_OPERATOR_SCHEMA_FIELDS, }; -constexpr DML_SCHEMA_FIELD DML_DIAGONAL_MATRIX1_OPERATOR_SCHEMA_FIELDS[6]{ +constexpr DML_SCHEMA_FIELD DML_DIAGONAL_MATRIX1_OPERATOR_SCHEMA_FIELDS[6] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", true }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "ValueDataType", false }, @@ -2359,7 +2334,7 @@ constexpr DML_SCHEMA_FIELD DML_DIAGONAL_MATRIX1_OPERATOR_SCHEMA_FIELDS[6]{ DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_INT, "DiagonalFillEnd", false }, }; -constexpr DML_OPERATOR_SCHEMA DML_DIAGONAL_MATRIX1_OPERATOR_SCHEMA{ +constexpr DML_OPERATOR_SCHEMA DML_DIAGONAL_MATRIX1_OPERATOR_SCHEMA { "DML_OPERATOR_DIAGONAL_MATRIX1", DML_OPERATOR_DIAGONAL_MATRIX1, DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, @@ -2396,6 +2371,48 @@ constexpr DML_OPERATOR_SCHEMA DML_MULTIHEAD_ATTENTION_OPERATOR_SCHEMA { DML_MULTIHEAD_ATTENTION_OPERATOR_SCHEMA_FIELDS, }; +constexpr DML_SCHEMA_FIELD DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA_FIELDS[13] { + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputScaleTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputZeroPointTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputScaleTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputZeroPointTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "DimensionCount", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Strides", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "WindowSize", 
false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "StartPadding", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "EndPadding", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Dilations", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "IncludePadding", false }, +}; + +constexpr DML_OPERATOR_SCHEMA DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA { + "DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING", + DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING, + DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, + 13, + DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA_FIELDS, +}; + +constexpr DML_SCHEMA_FIELD DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS[8] { + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ATensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AScaleTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AZeroPointTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BScaleTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BZeroPointTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BiasTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, +}; + +constexpr DML_OPERATOR_SCHEMA DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA { + "DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT", + DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT, + DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, + 8, + DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS, +}; constexpr DML_SCHEMA_FIELD DML_ACTIVATION_ELU_OPERATOR_SCHEMA_FIELDS[3] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, @@ -2732,6 +2749,35 @@ constexpr DML_OPERATOR_SCHEMA DML_ACTIVATION_GELU_OPERATOR_SCHEMA { DML_ACTIVATION_GELU_OPERATOR_SCHEMA_FIELDS, }; +constexpr DML_SCHEMA_FIELD DML_ACTIVATION_SWISH_OPERATOR_SCHEMA_FIELDS[3] { + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT, "SigmoidInputScale", false }, +}; + +constexpr DML_OPERATOR_SCHEMA DML_ACTIVATION_SWISH_OPERATOR_SCHEMA { + "DML_OPERATOR_ACTIVATION_SWISH", + DML_OPERATOR_ACTIVATION_SWISH, + DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, + 3, + DML_ACTIVATION_SWISH_OPERATOR_SCHEMA_FIELDS, +}; + +constexpr DML_SCHEMA_FIELD DML_ACTIVATION_HARD_SWISH_OPERATOR_SCHEMA_FIELDS[4] { + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, + DML_SCHEMA_FIELD { 
DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT, "Alpha", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT, "Beta", false }, +}; + +constexpr DML_OPERATOR_SCHEMA DML_ACTIVATION_HARD_SWISH_OPERATOR_SCHEMA { + "DML_OPERATOR_ACTIVATION_HARD_SWISH", + DML_OPERATOR_ACTIVATION_HARD_SWISH, + DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, + 4, + DML_ACTIVATION_HARD_SWISH_OPERATOR_SCHEMA_FIELDS, +}; + constexpr DML_SCHEMA_FIELD DML_RNN_ZERO_OPERATOR_SCHEMA_FIELDS[3] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "SequenceLengthsTensor", false }, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDesc_generated.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDesc_generated.h new file mode 100644 index 0000000000000..72059b9a3f911 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDesc_generated.h @@ -0,0 +1,788 @@ +// automatically generated by the FlatBuffers compiler, do not modify + + +#ifndef FLATBUFFERS_GENERATED_DMLGRAPHDESC_DML_IR_H_ +#define FLATBUFFERS_GENERATED_DMLGRAPHDESC_DML_IR_H_ + +#include "flatbuffers/flatbuffers.h" + +#include "OperatorFieldTypes_generated.h" + +namespace dml { +namespace ir { + +struct ConstantRawData; +struct ConstantRawDataBuilder; + +struct ConstantName; +struct ConstantNameBuilder; + +struct ConstantNodeDesc; +struct ConstantNodeDescBuilder; + +struct DmlBufferTensorDesc; +struct DmlBufferTensorDescBuilder; + +struct OperatorNodeDesc; +struct OperatorNodeDescBuilder; + +struct DmlGraphNode; +struct DmlGraphNodeBuilder; + +struct DmlGraphDesc; +struct DmlGraphDescBuilder; + +enum ConstantNodeDescDetail { + ConstantNodeDescDetail_NONE = 0, + ConstantNodeDescDetail_ConstantName = 1, + ConstantNodeDescDetail_ConstantRawData = 2, + ConstantNodeDescDetail_MIN = ConstantNodeDescDetail_NONE, + ConstantNodeDescDetail_MAX = ConstantNodeDescDetail_ConstantRawData +}; + +inline const ConstantNodeDescDetail (&EnumValuesConstantNodeDescDetail())[3] { + static const ConstantNodeDescDetail values[] = { + ConstantNodeDescDetail_NONE, + ConstantNodeDescDetail_ConstantName, + ConstantNodeDescDetail_ConstantRawData + }; + return values; +} + +inline const char * const *EnumNamesConstantNodeDescDetail() { + static const char * const names[4] = { + "NONE", + "ConstantName", + "ConstantRawData", + nullptr + }; + return names; +} + +inline const char *EnumNameConstantNodeDescDetail(ConstantNodeDescDetail e) { + if (flatbuffers::IsOutRange(e, ConstantNodeDescDetail_NONE, ConstantNodeDescDetail_ConstantRawData)) return ""; + const size_t index = static_cast(e); + return EnumNamesConstantNodeDescDetail()[index]; +} + +template struct ConstantNodeDescDetailTraits { + static const ConstantNodeDescDetail enum_value = ConstantNodeDescDetail_NONE; +}; + +template<> struct ConstantNodeDescDetailTraits { + static const ConstantNodeDescDetail enum_value = ConstantNodeDescDetail_ConstantName; +}; + +template<> struct ConstantNodeDescDetailTraits { + static const ConstantNodeDescDetail enum_value = ConstantNodeDescDetail_ConstantRawData; +}; + +bool VerifyConstantNodeDescDetail(flatbuffers::Verifier &verifier, const void *obj, ConstantNodeDescDetail type); +bool VerifyConstantNodeDescDetailVector(flatbuffers::Verifier &verifier, const 
flatbuffers::Vector> *values, const flatbuffers::Vector *types); + +enum NodeDesc { + NodeDesc_NONE = 0, + NodeDesc_OperatorNodeDesc = 1, + NodeDesc_ConstantNodeDesc = 2, + NodeDesc_MIN = NodeDesc_NONE, + NodeDesc_MAX = NodeDesc_ConstantNodeDesc +}; + +inline const NodeDesc (&EnumValuesNodeDesc())[3] { + static const NodeDesc values[] = { + NodeDesc_NONE, + NodeDesc_OperatorNodeDesc, + NodeDesc_ConstantNodeDesc + }; + return values; +} + +inline const char * const *EnumNamesNodeDesc() { + static const char * const names[4] = { + "NONE", + "OperatorNodeDesc", + "ConstantNodeDesc", + nullptr + }; + return names; +} + +inline const char *EnumNameNodeDesc(NodeDesc e) { + if (flatbuffers::IsOutRange(e, NodeDesc_NONE, NodeDesc_ConstantNodeDesc)) return ""; + const size_t index = static_cast(e); + return EnumNamesNodeDesc()[index]; +} + +template struct NodeDescTraits { + static const NodeDesc enum_value = NodeDesc_NONE; +}; + +template<> struct NodeDescTraits { + static const NodeDesc enum_value = NodeDesc_OperatorNodeDesc; +}; + +template<> struct NodeDescTraits { + static const NodeDesc enum_value = NodeDesc_ConstantNodeDesc; +}; + +bool VerifyNodeDesc(flatbuffers::Verifier &verifier, const void *obj, NodeDesc type); +bool VerifyNodeDescVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types); + +struct ConstantRawData FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef ConstantRawDataBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DATA = 4 + }; + const flatbuffers::Vector *data() const { + return GetPointer *>(VT_DATA); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_DATA) && + verifier.VerifyVector(data()) && + verifier.EndTable(); + } +}; + +struct ConstantRawDataBuilder { + typedef ConstantRawData Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_data(flatbuffers::Offset> data) { + fbb_.AddOffset(ConstantRawData::VT_DATA, data); + } + explicit ConstantRawDataBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ConstantRawDataBuilder &operator=(const ConstantRawDataBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateConstantRawData( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset> data = 0) { + ConstantRawDataBuilder builder_(_fbb); + builder_.add_data(data); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateConstantRawDataDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *data = nullptr) { + auto data__ = data ? 
_fbb.CreateVector(*data) : 0; + return dml::ir::CreateConstantRawData( + _fbb, + data__); +} + +struct ConstantName FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef ConstantNameBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NAME = 4 + }; + const flatbuffers::String *name() const { + return GetPointer(VT_NAME); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_NAME) && + verifier.VerifyString(name()) && + verifier.EndTable(); + } +}; + +struct ConstantNameBuilder { + typedef ConstantName Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_name(flatbuffers::Offset name) { + fbb_.AddOffset(ConstantName::VT_NAME, name); + } + explicit ConstantNameBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ConstantNameBuilder &operator=(const ConstantNameBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateConstantName( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset name = 0) { + ConstantNameBuilder builder_(_fbb); + builder_.add_name(name); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateConstantNameDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const char *name = nullptr) { + auto name__ = name ? _fbb.CreateString(name) : 0; + return dml::ir::CreateConstantName( + _fbb, + name__); +} + +struct ConstantNodeDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef ConstantNodeDescBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DATA_TYPE = 4, + VT_DATA = 6 + }; + dml::ir::ConstantNodeDescDetail data_type() const { + return static_cast(GetField(VT_DATA_TYPE, 0)); + } + const void *data() const { + return GetPointer(VT_DATA); + } + template const T *data_as() const; + const dml::ir::ConstantName *data_as_ConstantName() const { + return data_type() == dml::ir::ConstantNodeDescDetail_ConstantName ? static_cast(data()) : nullptr; + } + const dml::ir::ConstantRawData *data_as_ConstantRawData() const { + return data_type() == dml::ir::ConstantNodeDescDetail_ConstantRawData ? 
static_cast(data()) : nullptr; + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_DATA_TYPE) && + VerifyOffset(verifier, VT_DATA) && + VerifyConstantNodeDescDetail(verifier, data(), data_type()) && + verifier.EndTable(); + } +}; + +template<> inline const dml::ir::ConstantName *ConstantNodeDesc::data_as() const { + return data_as_ConstantName(); +} + +template<> inline const dml::ir::ConstantRawData *ConstantNodeDesc::data_as() const { + return data_as_ConstantRawData(); +} + +struct ConstantNodeDescBuilder { + typedef ConstantNodeDesc Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_data_type(dml::ir::ConstantNodeDescDetail data_type) { + fbb_.AddElement(ConstantNodeDesc::VT_DATA_TYPE, static_cast(data_type), 0); + } + void add_data(flatbuffers::Offset data) { + fbb_.AddOffset(ConstantNodeDesc::VT_DATA, data); + } + explicit ConstantNodeDescBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ConstantNodeDescBuilder &operator=(const ConstantNodeDescBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateConstantNodeDesc( + flatbuffers::FlatBufferBuilder &_fbb, + dml::ir::ConstantNodeDescDetail data_type = dml::ir::ConstantNodeDescDetail_NONE, + flatbuffers::Offset data = 0) { + ConstantNodeDescBuilder builder_(_fbb); + builder_.add_data(data); + builder_.add_data_type(data_type); + return builder_.Finish(); +} + +struct DmlBufferTensorDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef DmlBufferTensorDescBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DATATYPE = 4, + VT_SIZES = 6, + VT_STRIDES = 8, + VT_TOTALTENSORSIZEINBYTES = 10 + }; + const flatbuffers::String *dataType() const { + return GetPointer(VT_DATATYPE); + } + const flatbuffers::Vector *sizes() const { + return GetPointer *>(VT_SIZES); + } + const flatbuffers::Vector *strides() const { + return GetPointer *>(VT_STRIDES); + } + uint64_t totalTensorSizeInBytes() const { + return GetField(VT_TOTALTENSORSIZEINBYTES, 0); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_DATATYPE) && + verifier.VerifyString(dataType()) && + VerifyOffset(verifier, VT_SIZES) && + verifier.VerifyVector(sizes()) && + VerifyOffset(verifier, VT_STRIDES) && + verifier.VerifyVector(strides()) && + VerifyField(verifier, VT_TOTALTENSORSIZEINBYTES) && + verifier.EndTable(); + } +}; + +struct DmlBufferTensorDescBuilder { + typedef DmlBufferTensorDesc Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_dataType(flatbuffers::Offset dataType) { + fbb_.AddOffset(DmlBufferTensorDesc::VT_DATATYPE, dataType); + } + void add_sizes(flatbuffers::Offset> sizes) { + fbb_.AddOffset(DmlBufferTensorDesc::VT_SIZES, sizes); + } + void add_strides(flatbuffers::Offset> strides) { + fbb_.AddOffset(DmlBufferTensorDesc::VT_STRIDES, strides); + } + void add_totalTensorSizeInBytes(uint64_t totalTensorSizeInBytes) { + fbb_.AddElement(DmlBufferTensorDesc::VT_TOTALTENSORSIZEINBYTES, totalTensorSizeInBytes, 0); + } + explicit DmlBufferTensorDescBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + DmlBufferTensorDescBuilder &operator=(const DmlBufferTensorDescBuilder &); + flatbuffers::Offset 
Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateDmlBufferTensorDesc( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset dataType = 0, + flatbuffers::Offset> sizes = 0, + flatbuffers::Offset> strides = 0, + uint64_t totalTensorSizeInBytes = 0) { + DmlBufferTensorDescBuilder builder_(_fbb); + builder_.add_totalTensorSizeInBytes(totalTensorSizeInBytes); + builder_.add_strides(strides); + builder_.add_sizes(sizes); + builder_.add_dataType(dataType); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateDmlBufferTensorDescDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const char *dataType = nullptr, + const std::vector *sizes = nullptr, + const std::vector *strides = nullptr, + uint64_t totalTensorSizeInBytes = 0) { + auto dataType__ = dataType ? _fbb.CreateString(dataType) : 0; + auto sizes__ = sizes ? _fbb.CreateVector(*sizes) : 0; + auto strides__ = strides ? _fbb.CreateVector(*strides) : 0; + return dml::ir::CreateDmlBufferTensorDesc( + _fbb, + dataType__, + sizes__, + strides__, + totalTensorSizeInBytes); +} + +struct OperatorNodeDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef OperatorNodeDescBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_TYPE = 4, + VT_INPUTS = 6, + VT_OUTPUTS = 8, + VT_ATTRIBUTES = 10 + }; + const flatbuffers::String *type() const { + return GetPointer(VT_TYPE); + } + const flatbuffers::Vector> *inputs() const { + return GetPointer> *>(VT_INPUTS); + } + const flatbuffers::Vector> *outputs() const { + return GetPointer> *>(VT_OUTPUTS); + } + const flatbuffers::Vector> *attributes() const { + return GetPointer> *>(VT_ATTRIBUTES); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_TYPE) && + verifier.VerifyString(type()) && + VerifyOffset(verifier, VT_INPUTS) && + verifier.VerifyVector(inputs()) && + verifier.VerifyVectorOfTables(inputs()) && + VerifyOffset(verifier, VT_OUTPUTS) && + verifier.VerifyVector(outputs()) && + verifier.VerifyVectorOfTables(outputs()) && + VerifyOffset(verifier, VT_ATTRIBUTES) && + verifier.VerifyVector(attributes()) && + verifier.VerifyVectorOfTables(attributes()) && + verifier.EndTable(); + } +}; + +struct OperatorNodeDescBuilder { + typedef OperatorNodeDesc Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_type(flatbuffers::Offset type) { + fbb_.AddOffset(OperatorNodeDesc::VT_TYPE, type); + } + void add_inputs(flatbuffers::Offset>> inputs) { + fbb_.AddOffset(OperatorNodeDesc::VT_INPUTS, inputs); + } + void add_outputs(flatbuffers::Offset>> outputs) { + fbb_.AddOffset(OperatorNodeDesc::VT_OUTPUTS, outputs); + } + void add_attributes(flatbuffers::Offset>> attributes) { + fbb_.AddOffset(OperatorNodeDesc::VT_ATTRIBUTES, attributes); + } + explicit OperatorNodeDescBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + OperatorNodeDescBuilder &operator=(const OperatorNodeDescBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateOperatorNodeDesc( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset type = 0, + flatbuffers::Offset>> inputs = 0, + flatbuffers::Offset>> outputs = 0, + flatbuffers::Offset>> attributes = 0) { + OperatorNodeDescBuilder builder_(_fbb); + 
builder_.add_attributes(attributes); + builder_.add_outputs(outputs); + builder_.add_inputs(inputs); + builder_.add_type(type); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateOperatorNodeDescDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const char *type = nullptr, + const std::vector> *inputs = nullptr, + const std::vector> *outputs = nullptr, + const std::vector> *attributes = nullptr) { + auto type__ = type ? _fbb.CreateString(type) : 0; + auto inputs__ = inputs ? _fbb.CreateVector>(*inputs) : 0; + auto outputs__ = outputs ? _fbb.CreateVector>(*outputs) : 0; + auto attributes__ = attributes ? _fbb.CreateVector>(*attributes) : 0; + return dml::ir::CreateOperatorNodeDesc( + _fbb, + type__, + inputs__, + outputs__, + attributes__); +} + +struct DmlGraphNode FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef DmlGraphNodeBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DESC_TYPE = 4, + VT_DESC = 6, + VT_NAME = 8, + VT_INPUTNAMES = 10, + VT_OUTPUTNAMES = 12 + }; + dml::ir::NodeDesc desc_type() const { + return static_cast(GetField(VT_DESC_TYPE, 0)); + } + const void *desc() const { + return GetPointer(VT_DESC); + } + template const T *desc_as() const; + const dml::ir::OperatorNodeDesc *desc_as_OperatorNodeDesc() const { + return desc_type() == dml::ir::NodeDesc_OperatorNodeDesc ? static_cast(desc()) : nullptr; + } + const dml::ir::ConstantNodeDesc *desc_as_ConstantNodeDesc() const { + return desc_type() == dml::ir::NodeDesc_ConstantNodeDesc ? static_cast(desc()) : nullptr; + } + const flatbuffers::String *name() const { + return GetPointer(VT_NAME); + } + const flatbuffers::Vector> *inputNames() const { + return GetPointer> *>(VT_INPUTNAMES); + } + const flatbuffers::Vector> *outputNames() const { + return GetPointer> *>(VT_OUTPUTNAMES); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_DESC_TYPE) && + VerifyOffset(verifier, VT_DESC) && + VerifyNodeDesc(verifier, desc(), desc_type()) && + VerifyOffset(verifier, VT_NAME) && + verifier.VerifyString(name()) && + VerifyOffset(verifier, VT_INPUTNAMES) && + verifier.VerifyVector(inputNames()) && + verifier.VerifyVectorOfStrings(inputNames()) && + VerifyOffset(verifier, VT_OUTPUTNAMES) && + verifier.VerifyVector(outputNames()) && + verifier.VerifyVectorOfStrings(outputNames()) && + verifier.EndTable(); + } +}; + +template<> inline const dml::ir::OperatorNodeDesc *DmlGraphNode::desc_as() const { + return desc_as_OperatorNodeDesc(); +} + +template<> inline const dml::ir::ConstantNodeDesc *DmlGraphNode::desc_as() const { + return desc_as_ConstantNodeDesc(); +} + +struct DmlGraphNodeBuilder { + typedef DmlGraphNode Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_desc_type(dml::ir::NodeDesc desc_type) { + fbb_.AddElement(DmlGraphNode::VT_DESC_TYPE, static_cast(desc_type), 0); + } + void add_desc(flatbuffers::Offset desc) { + fbb_.AddOffset(DmlGraphNode::VT_DESC, desc); + } + void add_name(flatbuffers::Offset name) { + fbb_.AddOffset(DmlGraphNode::VT_NAME, name); + } + void add_inputNames(flatbuffers::Offset>> inputNames) { + fbb_.AddOffset(DmlGraphNode::VT_INPUTNAMES, inputNames); + } + void add_outputNames(flatbuffers::Offset>> outputNames) { + fbb_.AddOffset(DmlGraphNode::VT_OUTPUTNAMES, outputNames); + } + explicit DmlGraphNodeBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + DmlGraphNodeBuilder 
&operator=(const DmlGraphNodeBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateDmlGraphNode( + flatbuffers::FlatBufferBuilder &_fbb, + dml::ir::NodeDesc desc_type = dml::ir::NodeDesc_NONE, + flatbuffers::Offset desc = 0, + flatbuffers::Offset name = 0, + flatbuffers::Offset>> inputNames = 0, + flatbuffers::Offset>> outputNames = 0) { + DmlGraphNodeBuilder builder_(_fbb); + builder_.add_outputNames(outputNames); + builder_.add_inputNames(inputNames); + builder_.add_name(name); + builder_.add_desc(desc); + builder_.add_desc_type(desc_type); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateDmlGraphNodeDirect( + flatbuffers::FlatBufferBuilder &_fbb, + dml::ir::NodeDesc desc_type = dml::ir::NodeDesc_NONE, + flatbuffers::Offset desc = 0, + const char *name = nullptr, + const std::vector> *inputNames = nullptr, + const std::vector> *outputNames = nullptr) { + auto name__ = name ? _fbb.CreateString(name) : 0; + auto inputNames__ = inputNames ? _fbb.CreateVector>(*inputNames) : 0; + auto outputNames__ = outputNames ? _fbb.CreateVector>(*outputNames) : 0; + return dml::ir::CreateDmlGraphNode( + _fbb, + desc_type, + desc, + name__, + inputNames__, + outputNames__); +} + +struct DmlGraphDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef DmlGraphDescBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NODES = 4, + VT_GRAPHINPUTNAMES = 6, + VT_GRAPHOUTPUTNAMES = 8 + }; + const flatbuffers::Vector> *nodes() const { + return GetPointer> *>(VT_NODES); + } + const flatbuffers::Vector> *graphInputNames() const { + return GetPointer> *>(VT_GRAPHINPUTNAMES); + } + const flatbuffers::Vector> *graphOutputNames() const { + return GetPointer> *>(VT_GRAPHOUTPUTNAMES); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_NODES) && + verifier.VerifyVector(nodes()) && + verifier.VerifyVectorOfTables(nodes()) && + VerifyOffset(verifier, VT_GRAPHINPUTNAMES) && + verifier.VerifyVector(graphInputNames()) && + verifier.VerifyVectorOfStrings(graphInputNames()) && + VerifyOffset(verifier, VT_GRAPHOUTPUTNAMES) && + verifier.VerifyVector(graphOutputNames()) && + verifier.VerifyVectorOfStrings(graphOutputNames()) && + verifier.EndTable(); + } +}; + +struct DmlGraphDescBuilder { + typedef DmlGraphDesc Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_nodes(flatbuffers::Offset>> nodes) { + fbb_.AddOffset(DmlGraphDesc::VT_NODES, nodes); + } + void add_graphInputNames(flatbuffers::Offset>> graphInputNames) { + fbb_.AddOffset(DmlGraphDesc::VT_GRAPHINPUTNAMES, graphInputNames); + } + void add_graphOutputNames(flatbuffers::Offset>> graphOutputNames) { + fbb_.AddOffset(DmlGraphDesc::VT_GRAPHOUTPUTNAMES, graphOutputNames); + } + explicit DmlGraphDescBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + DmlGraphDescBuilder &operator=(const DmlGraphDescBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateDmlGraphDesc( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset>> nodes = 0, + flatbuffers::Offset>> graphInputNames = 0, + flatbuffers::Offset>> graphOutputNames = 0) { + DmlGraphDescBuilder builder_(_fbb); + 
builder_.add_graphOutputNames(graphOutputNames); + builder_.add_graphInputNames(graphInputNames); + builder_.add_nodes(nodes); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateDmlGraphDescDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const std::vector> *nodes = nullptr, + const std::vector> *graphInputNames = nullptr, + const std::vector> *graphOutputNames = nullptr) { + auto nodes__ = nodes ? _fbb.CreateVector>(*nodes) : 0; + auto graphInputNames__ = graphInputNames ? _fbb.CreateVector>(*graphInputNames) : 0; + auto graphOutputNames__ = graphOutputNames ? _fbb.CreateVector>(*graphOutputNames) : 0; + return dml::ir::CreateDmlGraphDesc( + _fbb, + nodes__, + graphInputNames__, + graphOutputNames__); +} + +inline bool VerifyConstantNodeDescDetail(flatbuffers::Verifier &verifier, const void *obj, ConstantNodeDescDetail type) { + switch (type) { + case ConstantNodeDescDetail_NONE: { + return true; + } + case ConstantNodeDescDetail_ConstantName: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case ConstantNodeDescDetail_ConstantRawData: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + default: return true; + } +} + +inline bool VerifyConstantNodeDescDetailVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types) { + if (!values || !types) return !values && !types; + if (values->size() != types->size()) return false; + for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { + if (!VerifyConstantNodeDescDetail( + verifier, values->Get(i), types->GetEnum(i))) { + return false; + } + } + return true; +} + +inline bool VerifyNodeDesc(flatbuffers::Verifier &verifier, const void *obj, NodeDesc type) { + switch (type) { + case NodeDesc_NONE: { + return true; + } + case NodeDesc_OperatorNodeDesc: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case NodeDesc_ConstantNodeDesc: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + default: return true; + } +} + +inline bool VerifyNodeDescVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types) { + if (!values || !types) return !values && !types; + if (values->size() != types->size()) return false; + for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { + if (!VerifyNodeDesc( + verifier, values->Get(i), types->GetEnum(i))) { + return false; + } + } + return true; +} + +inline const dml::ir::DmlGraphDesc *GetDmlGraphDesc(const void *buf) { + return flatbuffers::GetRoot(buf); +} + +inline const dml::ir::DmlGraphDesc *GetSizePrefixedDmlGraphDesc(const void *buf) { + return flatbuffers::GetSizePrefixedRoot(buf); +} + +inline bool VerifyDmlGraphDescBuffer( + flatbuffers::Verifier &verifier) { + return verifier.VerifyBuffer(nullptr); +} + +inline bool VerifySizePrefixedDmlGraphDescBuffer( + flatbuffers::Verifier &verifier) { + return verifier.VerifySizePrefixedBuffer(nullptr); +} + +inline void FinishDmlGraphDescBuffer( + flatbuffers::FlatBufferBuilder &fbb, + flatbuffers::Offset root) { + fbb.Finish(root); +} + +inline void FinishSizePrefixedDmlGraphDescBuffer( + flatbuffers::FlatBufferBuilder &fbb, + flatbuffers::Offset root) { + fbb.FinishSizePrefixed(root); +} + +} // namespace ir +} // namespace dml + +#endif // FLATBUFFERS_GENERATED_DMLGRAPHDESC_DML_IR_H_ diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDeserialization.h 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDeserialization.h
new file mode 100644
index 0000000000000..9decf0dce1bb2
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDeserialization.h
@@ -0,0 +1,14 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+
+#pragma once
+#include "DmlSerializedGraphDesc.h"
+
+struct NodeIndex
+{
+    uint32_t nodeIndex;
+    uint32_t nodeOutputIndex;
+};
+
+DmlSerializedGraphDesc DeserializeDmlGraph(
+    const uint8_t* flatbufferGraphDescBlob,
+    /*out*/ std::vector<std::unique_ptr<std::byte[]>>& rawData);
\ No newline at end of file
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphSerialization.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphSerialization.h
new file mode 100644
index 0000000000000..d8d069da906b7
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphSerialization.h
@@ -0,0 +1,8 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+
+#pragma once
+#include "DmlGraphDesc_generated.h"
+
+struct DmlSerializedGraphDesc;
+
+flatbuffers::DetachedBuffer SerializeDmlGraph(const DmlSerializedGraphDesc& graphDesc);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlSerializedGraphDesc.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlSerializedGraphDesc.h
new file mode 100644
index 0000000000000..51c3d6c81244b
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlSerializedGraphDesc.h
@@ -0,0 +1,73 @@
+//-----------------------------------------------------------------------------
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+//-----------------------------------------------------------------------------
+
+#pragma once
+
+struct ConstantName
+{
+    std::string name;
+};
+
+struct ConstantData
+{
+    std::byte* data;
+    uint64_t dataSize;
+};
+
+using DmlSerializedGraphNodeConstantVariant = std::variant<
+    ConstantName,
+    ConstantData
+>;
+
+using DmlSerializedGraphNodeDescVariant = std::variant<
+    AbstractOperatorDesc,
+    DmlSerializedGraphNodeConstantVariant
+>;
+
+struct DmlSerializedGraphNode
+{
+    DmlSerializedGraphNodeDescVariant Desc;
+    std::string Name;
+};
+
+struct DmlInputSerializedGraphEdge
+{
+    uint32_t GraphInputIndex;
+    uint32_t ToNodeIndex;
+    uint32_t ToNodeInputIndex;
+    std::string Name;
+};
+
+struct DmlOutputSerializedGraphEdge
+{
+    uint32_t FromNodeIndex;
+    uint32_t FromNodeOutputIndex;
+    uint32_t GraphOutputIndex;
+    std::string Name;
+};
+
+struct DmlIntermediateSerializedGraphEdge
+{
+    uint32_t FromNodeIndex;
+    uint32_t FromNodeOutputIndex;
+    uint32_t ToNodeIndex;
+    uint32_t ToNodeInputIndex;
+    std::string Name;
+};
+
+struct DmlSerializedGraphDesc
+{
+    uint32_t InputCount;
+    uint32_t OutputCount;
+    // Nodes must be present in topological order for deserialization to work:
+    // while creating an intermediate edge during deserialization, the node from
+    // which the intermediate edge originates must be visited before the node
+    // into which the intermediate edge feeds.
+    std::vector<DmlSerializedGraphNode> Nodes;
+    std::vector<DmlInputSerializedGraphEdge> InputEdges;
+    std::vector<DmlOutputSerializedGraphEdge> OutputEdges;
+    std::vector<DmlIntermediateSerializedGraphEdge> IntermediateEdges;
+};
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h
index 99218c135f058..4be41ad3924a2 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h
@@ -425,7 +425,6 @@ inline std::vector GetFields(const DML_AVERAGE_POOLING_OPERATOR_D
         OperatorField(&DML_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.IncludePadding))),
     };
 }
-
 inline std::vector GetFields(const DML_AVERAGE_POOLING1_OPERATOR_DESC& desc)
 {
     return {
@@ -502,24 +501,6 @@ inline std::vector GetFields(const DML_ROI_POOLING_OPERATOR_DESC&
         OperatorField(&DML_ROI_POOLING_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.PooledSize))),
     };
 }
-inline std::vector GetFields(const DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC& desc)
-{
-    return {
-        OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.InputTensor))),
-        OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.InputScaleTensor))),
-        OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.InputZeroPointTensor))),
-        OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.OutputScaleTensor))),
-        OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.OutputZeroPointTensor))),
-        OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[5], ToOperatorFieldType(static_cast(desc.OutputTensor))),
-        OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[6],
ToOperatorFieldType(static_cast(desc.DimensionCount))), - OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.Strides), desc.DimensionCount)), - OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[8], ToOperatorFieldType(static_cast(desc.WindowSize), desc.DimensionCount)), - OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[9], ToOperatorFieldType(static_cast(desc.StartPadding), desc.DimensionCount)), - OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[10], ToOperatorFieldType(static_cast(desc.EndPadding), desc.DimensionCount)), - OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[11], ToOperatorFieldType(static_cast(desc.Dilations), desc.DimensionCount)), - OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[12], ToOperatorFieldType(static_cast(desc.IncludePadding))), - }; -} inline std::vector GetFields(const DML_SLICE_OPERATOR_DESC& desc) { return { @@ -1488,6 +1469,37 @@ inline std::vector GetFields(const DML_MULTIHEAD_ATTENTION_OPERAT OperatorField(&DML_MULTIHEAD_ATTENTION_OPERATOR_SCHEMA.Fields[17], ToOperatorFieldType(static_cast(desc.MaskType))), }; } +inline std::vector GetFields(const DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC& desc) +{ + return { + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.InputTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.InputScaleTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.InputZeroPointTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.OutputScaleTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.OutputZeroPointTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[5], ToOperatorFieldType(static_cast(desc.OutputTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[6], ToOperatorFieldType(static_cast(desc.DimensionCount))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.Strides), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[8], ToOperatorFieldType(static_cast(desc.WindowSize), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[9], ToOperatorFieldType(static_cast(desc.StartPadding), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[10], ToOperatorFieldType(static_cast(desc.EndPadding), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[11], ToOperatorFieldType(static_cast(desc.Dilations), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[12], ToOperatorFieldType(static_cast(desc.IncludePadding))), + }; +} +inline std::vector GetFields(const DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC& desc) +{ + return { + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.ATensor))), + 
OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.AScaleTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.AZeroPointTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.BTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.BScaleTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[5], ToOperatorFieldType(static_cast(desc.BZeroPointTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[6], ToOperatorFieldType(static_cast(desc.BiasTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.OutputTensor))), + }; +} inline std::vector GetFields(const DML_ACTIVATION_ELU_OPERATOR_DESC& desc) { return { @@ -1680,6 +1692,23 @@ inline std::vector GetFields(const DML_ACTIVATION_GELU_OPERATOR_D OperatorField(&DML_ACTIVATION_GELU_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.OutputTensor))), }; } +inline std::vector GetFields(const DML_ACTIVATION_SWISH_OPERATOR_DESC& desc) +{ + return { + OperatorField(&DML_ACTIVATION_SWISH_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.InputTensor))), + OperatorField(&DML_ACTIVATION_SWISH_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.OutputTensor))), + OperatorField(&DML_ACTIVATION_SWISH_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.SigmoidInputScale))), + }; +} +inline std::vector GetFields(const DML_ACTIVATION_HARD_SWISH_OPERATOR_DESC& desc) +{ + return { + OperatorField(&DML_ACTIVATION_HARD_SWISH_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.InputTensor))), + OperatorField(&DML_ACTIVATION_HARD_SWISH_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.OutputTensor))), + OperatorField(&DML_ACTIVATION_HARD_SWISH_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.Alpha))), + OperatorField(&DML_ACTIVATION_HARD_SWISH_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.Beta))), + }; +} inline const DML_OPERATOR_SCHEMA& GetSchema(DML_OPERATOR_TYPE operatorType) { switch (operatorType) @@ -1826,6 +1855,8 @@ inline const DML_OPERATOR_SCHEMA& GetSchema(DML_OPERATOR_TYPE operatorType) case DML_OPERATOR_RESAMPLE_GRAD1: return DML_RESAMPLE_GRAD1_OPERATOR_SCHEMA; case DML_OPERATOR_DIAGONAL_MATRIX1: return DML_DIAGONAL_MATRIX1_OPERATOR_SCHEMA; case DML_OPERATOR_MULTIHEAD_ATTENTION: return DML_MULTIHEAD_ATTENTION_OPERATOR_SCHEMA; + case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: return DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA; + case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: return DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA; case DML_OPERATOR_ACTIVATION_ELU: return DML_ACTIVATION_ELU_OPERATOR_SCHEMA; case DML_OPERATOR_ACTIVATION_CELU: return DML_ACTIVATION_CELU_OPERATOR_SCHEMA; case DML_OPERATOR_ACTIVATION_HARDMAX: return DML_ACTIVATION_HARDMAX_OPERATOR_SCHEMA; @@ -1850,6 +1881,8 @@ inline const DML_OPERATOR_SCHEMA& GetSchema(DML_OPERATOR_TYPE operatorType) case DML_OPERATOR_ACTIVATION_THRESHOLDED_RELU: return DML_ACTIVATION_THRESHOLDED_RELU_OPERATOR_SCHEMA; case DML_OPERATOR_ACTIVATION_SHRINK: return DML_ACTIVATION_SHRINK_OPERATOR_SCHEMA; case DML_OPERATOR_ACTIVATION_GELU: return 
DML_ACTIVATION_GELU_OPERATOR_SCHEMA; + case DML_OPERATOR_ACTIVATION_SWISH: return DML_ACTIVATION_SWISH_OPERATOR_SCHEMA; + case DML_OPERATOR_ACTIVATION_HARD_SWISH: return DML_ACTIVATION_HARD_SWISH_OPERATOR_SCHEMA; default: ORT_THROW_HR(E_INVALIDARG); @@ -2431,6 +2464,14 @@ inline AbstractOperatorDesc ConvertOperatorDesc(const DML_OPERATOR_DESC& opDesc) return AbstractOperatorDesc( &DML_MULTIHEAD_ATTENTION_OPERATOR_SCHEMA, GetFields(*static_cast(opDesc.Desc))); + case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: + return AbstractOperatorDesc( + &DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA, + GetFields(*static_cast(opDesc.Desc))); + case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: + return AbstractOperatorDesc( + &DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA, + GetFields(*static_cast(opDesc.Desc))); case DML_OPERATOR_ACTIVATION_ELU: return AbstractOperatorDesc( &DML_ACTIVATION_ELU_OPERATOR_SCHEMA, @@ -2527,13 +2568,14 @@ inline AbstractOperatorDesc ConvertOperatorDesc(const DML_OPERATOR_DESC& opDesc) return AbstractOperatorDesc( &DML_ACTIVATION_GELU_OPERATOR_SCHEMA, GetFields(*static_cast(opDesc.Desc))); -#pragma warning(push) -#pragma warning(disable: 4063) - case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: + case DML_OPERATOR_ACTIVATION_SWISH: return AbstractOperatorDesc( - &DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA, - GetFields(*static_cast(opDesc.Desc))); -#pragma warning(pop) + &DML_ACTIVATION_SWISH_OPERATOR_SCHEMA, + GetFields(*static_cast(opDesc.Desc))); + case DML_OPERATOR_ACTIVATION_HARD_SWISH: + return AbstractOperatorDesc( + &DML_ACTIVATION_HARD_SWISH_OPERATOR_SCHEMA, + GetFields(*static_cast(opDesc.Desc))); default: ORT_THROW_HR(E_INVALIDARG); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaTypes.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaTypes.h index 25f0dd26c6067..a94bb67b68d36 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaTypes.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaTypes.h @@ -15,32 +15,34 @@ using ApiAttributeVariant = std::variant< const FLOAT*, const DML_SCALE_BIAS*, DML_SIZE_2D, - DML_SCALAR_UNION + DML_SCALAR_UNION, + BOOL >; namespace OperatorFieldTypes { using TensorDesc = std::optional; // DML_SCHEMA_FIELD_TYPE_TENSOR_DESC using TensorDescArray = std::optional>; // DML_SCHEMA_FIELD_TYPE_TENSOR_DESC_ARRAY - using OperatorDesc = std::optional; // DML_SCHEMA_FIELD_TYPE_OPERATOR_DESC - using OperatorDescArray = std::optional>; // DML_SCHEMA_FIELD_TYPE_OPERATOR_DESC_ARRAY + using FusedActivationOperatorDesc = std::optional; // DML_SCHEMA_FIELD_TYPE_OPERATOR_DESC + using FusedActivationOperatorDescArray = std::optional>; // DML_SCHEMA_FIELD_TYPE_OPERATOR_DESC_ARRAY using UInt = uint32_t; // DML_SCHEMA_FIELD_TYPE_UINT using UInt64 = uint64_t; // DML_SCHEMA_FIELD_TYPE_UINT64 using Int = int32_t; // DML_SCHEMA_FIELD_TYPE_INT using Float = float; // DML_SCHEMA_FIELD_TYPE_FLOAT - using UIntArray = std::optional>; // DML_SCHEMA_FIELD_TYPE_UINT_ARRAY - using IntArray = std::optional>; // DML_SCHEMA_FIELD_TYPE_INT_ARRAY - using FloatArray = std::optional>; // DML_SCHEMA_FIELD_TYPE_FLOAT_ARRAY + using UIntArray = std::vector; // DML_SCHEMA_FIELD_TYPE_UINT_ARRAY + using IntArray = std::vector; // DML_SCHEMA_FIELD_TYPE_INT_ARRAY + using FloatArray = std::vector; // DML_SCHEMA_FIELD_TYPE_FLOAT_ARRAY using 
ScaleBias = std::optional; // DML_SCHEMA_FIELD_TYPE_SCALE_BIAS using Size2D = DML_SIZE_2D; // DML_SCHEMA_FIELD_TYPE_SIZE_2D using ScalarUnion = DML_SCALAR_UNION; // DML_SCHEMA_FIELD_TYPE_SCALAR_UNION + using Bool = bool; // DML_SCHEMA_FIELD_TYPE_BOOL } using OperatorFieldVariant = std::variant< OperatorFieldTypes::TensorDesc, OperatorFieldTypes::TensorDescArray, - OperatorFieldTypes::OperatorDesc, - OperatorFieldTypes::OperatorDescArray, + OperatorFieldTypes::FusedActivationOperatorDesc, + OperatorFieldTypes::FusedActivationOperatorDescArray, OperatorFieldTypes::UInt, OperatorFieldTypes::UInt64, OperatorFieldTypes::Int, @@ -50,7 +52,8 @@ using OperatorFieldVariant = std::variant< OperatorFieldTypes::FloatArray, OperatorFieldTypes::ScaleBias, OperatorFieldTypes::Size2D, - OperatorFieldTypes::ScalarUnion + OperatorFieldTypes::ScalarUnion, + OperatorFieldTypes::Bool >; class OperatorField @@ -80,11 +83,11 @@ class OperatorField const OperatorFieldTypes::TensorDescArray& AsTensorDescArray() const { return std::get(m_data); } OperatorFieldTypes::TensorDescArray& AsTensorDescArray() { return std::get(m_data); } - const OperatorFieldTypes::OperatorDesc& AsOperatorDesc() const { return std::get(m_data); } - OperatorFieldTypes::OperatorDesc& AsOperatorDesc() { return std::get(m_data); } + const OperatorFieldTypes::FusedActivationOperatorDesc& AsFusedActivationOperatorDesc() const { return std::get(m_data); } + OperatorFieldTypes::FusedActivationOperatorDesc& AsFusedActivationOperatorDesc() { return std::get(m_data); } - const OperatorFieldTypes::OperatorDescArray& AsOperatorDescArray() const { return std::get(m_data); } - OperatorFieldTypes::OperatorDescArray& AsOperatorDescArray() { return std::get(m_data); } + const OperatorFieldTypes::FusedActivationOperatorDescArray& AsFusedActivationOperatorDescArray() const { return std::get(m_data); } + OperatorFieldTypes::FusedActivationOperatorDescArray& AsFusedActivationOperatorDescArray() { return std::get(m_data); } const OperatorFieldTypes::UInt& AsUInt() const { return std::get(m_data); } OperatorFieldTypes::UInt& AsUInt() { return std::get(m_data); } @@ -116,6 +119,9 @@ class OperatorField const OperatorFieldTypes::ScalarUnion& AsScalarUnion() const { return std::get(m_data); } OperatorFieldTypes::ScalarUnion& AsScalarUnion() { return std::get(m_data); } + const OperatorFieldTypes::Bool& AsBool() const { return std::get(m_data); } + OperatorFieldTypes::Bool& AsBool() { return std::get(m_data); } + private: const DML_SCHEMA_FIELD* m_schema; OperatorFieldVariant m_data; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/OperatorFieldTypes_generated.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/OperatorFieldTypes_generated.h new file mode 100644 index 0000000000000..167a913bb0132 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/OperatorFieldTypes_generated.h @@ -0,0 +1,1318 @@ +// automatically generated by the FlatBuffers compiler, do not modify + + +#ifndef FLATBUFFERS_GENERATED_OPERATORFIELDTYPES_DML_IR_OPERATORFIELDTYPES_H_ +#define FLATBUFFERS_GENERATED_OPERATORFIELDTYPES_DML_IR_OPERATORFIELDTYPES_H_ + +#include "flatbuffers/flatbuffers.h" + +namespace dml { +namespace ir { +namespace operatorFieldTypes { + +struct AttributeDesc; +struct AttributeDescBuilder; + +struct Activation; +struct ActivationBuilder; + +struct ActivationArray; +struct ActivationArrayBuilder; + +struct UInt8; + +struct UInt16; + +struct UInt32; 
+ +struct UInt64; + +struct Int8; + +struct Int16; + +struct Int32; + +struct Int64; + +struct Float32; + +struct Float64; + +struct UIntArray; +struct UIntArrayBuilder; + +struct IntArray; +struct IntArrayBuilder; + +struct FloatArray; +struct FloatArrayBuilder; + +struct ScaleBias; + +struct Size2D; + +struct ByteArray; + +struct ScalarUnionData; +struct ScalarUnionDataBuilder; + +struct Bool; + +enum AttributeFieldVariant { + AttributeFieldVariant_NONE = 0, + AttributeFieldVariant_Activation = 1, + AttributeFieldVariant_ActivationArray = 2, + AttributeFieldVariant_UInt32 = 3, + AttributeFieldVariant_UInt64 = 4, + AttributeFieldVariant_Int32 = 5, + AttributeFieldVariant_Float32 = 6, + AttributeFieldVariant_UIntArray = 7, + AttributeFieldVariant_IntArray = 8, + AttributeFieldVariant_FloatArray = 9, + AttributeFieldVariant_ScaleBias = 10, + AttributeFieldVariant_Size2D = 11, + AttributeFieldVariant_ScalarUnionData = 12, + AttributeFieldVariant_Bool = 13, + AttributeFieldVariant_MIN = AttributeFieldVariant_NONE, + AttributeFieldVariant_MAX = AttributeFieldVariant_Bool +}; + +inline const AttributeFieldVariant (&EnumValuesAttributeFieldVariant())[14] { + static const AttributeFieldVariant values[] = { + AttributeFieldVariant_NONE, + AttributeFieldVariant_Activation, + AttributeFieldVariant_ActivationArray, + AttributeFieldVariant_UInt32, + AttributeFieldVariant_UInt64, + AttributeFieldVariant_Int32, + AttributeFieldVariant_Float32, + AttributeFieldVariant_UIntArray, + AttributeFieldVariant_IntArray, + AttributeFieldVariant_FloatArray, + AttributeFieldVariant_ScaleBias, + AttributeFieldVariant_Size2D, + AttributeFieldVariant_ScalarUnionData, + AttributeFieldVariant_Bool + }; + return values; +} + +inline const char * const *EnumNamesAttributeFieldVariant() { + static const char * const names[15] = { + "NONE", + "Activation", + "ActivationArray", + "UInt32", + "UInt64", + "Int32", + "Float32", + "UIntArray", + "IntArray", + "FloatArray", + "ScaleBias", + "Size2D", + "ScalarUnionData", + "Bool", + nullptr + }; + return names; +} + +inline const char *EnumNameAttributeFieldVariant(AttributeFieldVariant e) { + if (flatbuffers::IsOutRange(e, AttributeFieldVariant_NONE, AttributeFieldVariant_Bool)) return ""; + const size_t index = static_cast(e); + return EnumNamesAttributeFieldVariant()[index]; +} + +template struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_NONE; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_Activation; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_ActivationArray; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_UInt32; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_UInt64; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_Int32; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_Float32; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_UIntArray; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = 
AttributeFieldVariant_IntArray; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_FloatArray; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_ScaleBias; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_Size2D; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_ScalarUnionData; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_Bool; +}; + +bool VerifyAttributeFieldVariant(flatbuffers::Verifier &verifier, const void *obj, AttributeFieldVariant type); +bool VerifyAttributeFieldVariantVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types); + +enum ScalarVariant { + ScalarVariant_NONE = 0, + ScalarVariant_ByteArray = 1, + ScalarVariant_Int8 = 2, + ScalarVariant_UInt8 = 3, + ScalarVariant_Int16 = 4, + ScalarVariant_UInt16 = 5, + ScalarVariant_Int32 = 6, + ScalarVariant_UInt32 = 7, + ScalarVariant_Int64 = 8, + ScalarVariant_UInt64 = 9, + ScalarVariant_Float32 = 10, + ScalarVariant_Float64 = 11, + ScalarVariant_MIN = ScalarVariant_NONE, + ScalarVariant_MAX = ScalarVariant_Float64 +}; + +inline const ScalarVariant (&EnumValuesScalarVariant())[12] { + static const ScalarVariant values[] = { + ScalarVariant_NONE, + ScalarVariant_ByteArray, + ScalarVariant_Int8, + ScalarVariant_UInt8, + ScalarVariant_Int16, + ScalarVariant_UInt16, + ScalarVariant_Int32, + ScalarVariant_UInt32, + ScalarVariant_Int64, + ScalarVariant_UInt64, + ScalarVariant_Float32, + ScalarVariant_Float64 + }; + return values; +} + +inline const char * const *EnumNamesScalarVariant() { + static const char * const names[13] = { + "NONE", + "ByteArray", + "Int8", + "UInt8", + "Int16", + "UInt16", + "Int32", + "UInt32", + "Int64", + "UInt64", + "Float32", + "Float64", + nullptr + }; + return names; +} + +inline const char *EnumNameScalarVariant(ScalarVariant e) { + if (flatbuffers::IsOutRange(e, ScalarVariant_NONE, ScalarVariant_Float64)) return ""; + const size_t index = static_cast(e); + return EnumNamesScalarVariant()[index]; +} + +template struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_NONE; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_ByteArray; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_Int8; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_UInt8; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_Int16; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_UInt16; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_Int32; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_UInt32; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_Int64; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_UInt64; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = 
ScalarVariant_Float32; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_Float64; +}; + +bool VerifyScalarVariant(flatbuffers::Verifier &verifier, const void *obj, ScalarVariant type); +bool VerifyScalarVariantVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(1) UInt8 FLATBUFFERS_FINAL_CLASS { + private: + uint8_t data_; + + public: + UInt8() { + memset(static_cast(this), 0, sizeof(UInt8)); + } + UInt8(uint8_t _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + uint8_t data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(uint8_t _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(UInt8, 1); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(2) UInt16 FLATBUFFERS_FINAL_CLASS { + private: + uint16_t data_; + + public: + UInt16() { + memset(static_cast(this), 0, sizeof(UInt16)); + } + UInt16(uint16_t _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + uint16_t data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(uint16_t _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(UInt16, 2); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(4) UInt32 FLATBUFFERS_FINAL_CLASS { + private: + uint32_t data_; + + public: + UInt32() { + memset(static_cast(this), 0, sizeof(UInt32)); + } + UInt32(uint32_t _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + uint32_t data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(uint32_t _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(UInt32, 4); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) UInt64 FLATBUFFERS_FINAL_CLASS { + private: + uint64_t data_; + + public: + UInt64() { + memset(static_cast(this), 0, sizeof(UInt64)); + } + UInt64(uint64_t _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + uint64_t data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(uint64_t _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(UInt64, 8); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(1) Int8 FLATBUFFERS_FINAL_CLASS { + private: + int8_t data_; + + public: + Int8() { + memset(static_cast(this), 0, sizeof(Int8)); + } + Int8(int8_t _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + int8_t data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(int8_t _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(Int8, 1); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(2) Int16 FLATBUFFERS_FINAL_CLASS { + private: + int16_t data_; + + public: + Int16() { + memset(static_cast(this), 0, sizeof(Int16)); + } + Int16(int16_t _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + int16_t data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(int16_t _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(Int16, 2); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(4) Int32 FLATBUFFERS_FINAL_CLASS { + private: + int32_t data_; + + public: + Int32() { + memset(static_cast(this), 0, sizeof(Int32)); + } + Int32(int32_t _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + int32_t data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(int32_t _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(Int32, 4); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) Int64 
FLATBUFFERS_FINAL_CLASS { + private: + int64_t data_; + + public: + Int64() { + memset(static_cast(this), 0, sizeof(Int64)); + } + Int64(int64_t _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + int64_t data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(int64_t _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(Int64, 8); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(4) Float32 FLATBUFFERS_FINAL_CLASS { + private: + float data_; + + public: + Float32() { + memset(static_cast(this), 0, sizeof(Float32)); + } + Float32(float _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + float data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(float _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(Float32, 4); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) Float64 FLATBUFFERS_FINAL_CLASS { + private: + double data_; + + public: + Float64() { + memset(static_cast(this), 0, sizeof(Float64)); + } + Float64(double _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + double data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(double _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(Float64, 8); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(4) ScaleBias FLATBUFFERS_FINAL_CLASS { + private: + float scale_; + float bias_; + + public: + ScaleBias() { + memset(static_cast(this), 0, sizeof(ScaleBias)); + } + ScaleBias(float _scale, float _bias) + : scale_(flatbuffers::EndianScalar(_scale)), + bias_(flatbuffers::EndianScalar(_bias)) { + } + float scale() const { + return flatbuffers::EndianScalar(scale_); + } + void mutate_scale(float _scale) { + flatbuffers::WriteScalar(&scale_, _scale); + } + float bias() const { + return flatbuffers::EndianScalar(bias_); + } + void mutate_bias(float _bias) { + flatbuffers::WriteScalar(&bias_, _bias); + } +}; +FLATBUFFERS_STRUCT_END(ScaleBias, 8); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(4) Size2D FLATBUFFERS_FINAL_CLASS { + private: + uint32_t width_; + uint32_t height_; + + public: + Size2D() { + memset(static_cast(this), 0, sizeof(Size2D)); + } + Size2D(uint32_t _width, uint32_t _height) + : width_(flatbuffers::EndianScalar(_width)), + height_(flatbuffers::EndianScalar(_height)) { + } + uint32_t width() const { + return flatbuffers::EndianScalar(width_); + } + void mutate_width(uint32_t _width) { + flatbuffers::WriteScalar(&width_, _width); + } + uint32_t height() const { + return flatbuffers::EndianScalar(height_); + } + void mutate_height(uint32_t _height) { + flatbuffers::WriteScalar(&height_, _height); + } +}; +FLATBUFFERS_STRUCT_END(Size2D, 8); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(1) ByteArray FLATBUFFERS_FINAL_CLASS { + private: + uint8_t data_[8]; + + public: + ByteArray() { + memset(static_cast(this), 0, sizeof(ByteArray)); + } + const flatbuffers::Array *data() const { + return reinterpret_cast *>(data_); + } + flatbuffers::Array *mutable_data() { + return reinterpret_cast *>(data_); + } +}; +FLATBUFFERS_STRUCT_END(ByteArray, 8); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(1) Bool FLATBUFFERS_FINAL_CLASS { + private: + uint8_t data_; + + public: + Bool() { + memset(static_cast(this), 0, sizeof(Bool)); + } + Bool(bool _data) + : data_(flatbuffers::EndianScalar(static_cast(_data))) { + } + bool data() const { + return flatbuffers::EndianScalar(data_) != 0; + } + void mutate_data(bool _data) { + flatbuffers::WriteScalar(&data_, static_cast(_data)); + } +}; +FLATBUFFERS_STRUCT_END(Bool, 
1); + +struct AttributeDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef AttributeDescBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NAME = 4, + VT_VAL_TYPE = 6, + VT_VAL = 8 + }; + const flatbuffers::String *name() const { + return GetPointer(VT_NAME); + } + flatbuffers::String *mutable_name() { + return GetPointer(VT_NAME); + } + dml::ir::operatorFieldTypes::AttributeFieldVariant val_type() const { + return static_cast(GetField(VT_VAL_TYPE, 0)); + } + const void *val() const { + return GetPointer(VT_VAL); + } + template const T *val_as() const; + const dml::ir::operatorFieldTypes::Activation *val_as_Activation() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_Activation ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::ActivationArray *val_as_ActivationArray() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_ActivationArray ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::UInt32 *val_as_UInt32() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_UInt32 ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::UInt64 *val_as_UInt64() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_UInt64 ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::Int32 *val_as_Int32() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_Int32 ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::Float32 *val_as_Float32() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_Float32 ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::UIntArray *val_as_UIntArray() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_UIntArray ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::IntArray *val_as_IntArray() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_IntArray ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::FloatArray *val_as_FloatArray() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_FloatArray ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::ScaleBias *val_as_ScaleBias() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_ScaleBias ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::Size2D *val_as_Size2D() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_Size2D ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::ScalarUnionData *val_as_ScalarUnionData() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_ScalarUnionData ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::Bool *val_as_Bool() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_Bool ? 
static_cast(val()) : nullptr; + } + void *mutable_val() { + return GetPointer(VT_VAL); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_NAME) && + verifier.VerifyString(name()) && + VerifyField(verifier, VT_VAL_TYPE) && + VerifyOffset(verifier, VT_VAL) && + VerifyAttributeFieldVariant(verifier, val(), val_type()) && + verifier.EndTable(); + } +}; + +template<> inline const dml::ir::operatorFieldTypes::Activation *AttributeDesc::val_as() const { + return val_as_Activation(); +} + +template<> inline const dml::ir::operatorFieldTypes::ActivationArray *AttributeDesc::val_as() const { + return val_as_ActivationArray(); +} + +template<> inline const dml::ir::operatorFieldTypes::UInt32 *AttributeDesc::val_as() const { + return val_as_UInt32(); +} + +template<> inline const dml::ir::operatorFieldTypes::UInt64 *AttributeDesc::val_as() const { + return val_as_UInt64(); +} + +template<> inline const dml::ir::operatorFieldTypes::Int32 *AttributeDesc::val_as() const { + return val_as_Int32(); +} + +template<> inline const dml::ir::operatorFieldTypes::Float32 *AttributeDesc::val_as() const { + return val_as_Float32(); +} + +template<> inline const dml::ir::operatorFieldTypes::UIntArray *AttributeDesc::val_as() const { + return val_as_UIntArray(); +} + +template<> inline const dml::ir::operatorFieldTypes::IntArray *AttributeDesc::val_as() const { + return val_as_IntArray(); +} + +template<> inline const dml::ir::operatorFieldTypes::FloatArray *AttributeDesc::val_as() const { + return val_as_FloatArray(); +} + +template<> inline const dml::ir::operatorFieldTypes::ScaleBias *AttributeDesc::val_as() const { + return val_as_ScaleBias(); +} + +template<> inline const dml::ir::operatorFieldTypes::Size2D *AttributeDesc::val_as() const { + return val_as_Size2D(); +} + +template<> inline const dml::ir::operatorFieldTypes::ScalarUnionData *AttributeDesc::val_as() const { + return val_as_ScalarUnionData(); +} + +template<> inline const dml::ir::operatorFieldTypes::Bool *AttributeDesc::val_as() const { + return val_as_Bool(); +} + +struct AttributeDescBuilder { + typedef AttributeDesc Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_name(flatbuffers::Offset name) { + fbb_.AddOffset(AttributeDesc::VT_NAME, name); + } + void add_val_type(dml::ir::operatorFieldTypes::AttributeFieldVariant val_type) { + fbb_.AddElement(AttributeDesc::VT_VAL_TYPE, static_cast(val_type), 0); + } + void add_val(flatbuffers::Offset val) { + fbb_.AddOffset(AttributeDesc::VT_VAL, val); + } + explicit AttributeDescBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + AttributeDescBuilder &operator=(const AttributeDescBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateAttributeDesc( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset name = 0, + dml::ir::operatorFieldTypes::AttributeFieldVariant val_type = dml::ir::operatorFieldTypes::AttributeFieldVariant_NONE, + flatbuffers::Offset val = 0) { + AttributeDescBuilder builder_(_fbb); + builder_.add_val(val); + builder_.add_name(name); + builder_.add_val_type(val_type); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateAttributeDescDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const char *name = nullptr, + dml::ir::operatorFieldTypes::AttributeFieldVariant val_type = 
dml::ir::operatorFieldTypes::AttributeFieldVariant_NONE, + flatbuffers::Offset val = 0) { + auto name__ = name ? _fbb.CreateString(name) : 0; + return dml::ir::operatorFieldTypes::CreateAttributeDesc( + _fbb, + name__, + val_type, + val); +} + +struct Activation FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef ActivationBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_TYPE = 4, + VT_ATTRIBUTES = 6 + }; + const flatbuffers::String *type() const { + return GetPointer(VT_TYPE); + } + flatbuffers::String *mutable_type() { + return GetPointer(VT_TYPE); + } + const flatbuffers::Vector> *attributes() const { + return GetPointer> *>(VT_ATTRIBUTES); + } + flatbuffers::Vector> *mutable_attributes() { + return GetPointer> *>(VT_ATTRIBUTES); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_TYPE) && + verifier.VerifyString(type()) && + VerifyOffset(verifier, VT_ATTRIBUTES) && + verifier.VerifyVector(attributes()) && + verifier.VerifyVectorOfTables(attributes()) && + verifier.EndTable(); + } +}; + +struct ActivationBuilder { + typedef Activation Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_type(flatbuffers::Offset type) { + fbb_.AddOffset(Activation::VT_TYPE, type); + } + void add_attributes(flatbuffers::Offset>> attributes) { + fbb_.AddOffset(Activation::VT_ATTRIBUTES, attributes); + } + explicit ActivationBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ActivationBuilder &operator=(const ActivationBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateActivation( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset type = 0, + flatbuffers::Offset>> attributes = 0) { + ActivationBuilder builder_(_fbb); + builder_.add_attributes(attributes); + builder_.add_type(type); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateActivationDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const char *type = nullptr, + const std::vector> *attributes = nullptr) { + auto type__ = type ? _fbb.CreateString(type) : 0; + auto attributes__ = attributes ? 
_fbb.CreateVector>(*attributes) : 0; + return dml::ir::operatorFieldTypes::CreateActivation( + _fbb, + type__, + attributes__); +} + +struct ActivationArray FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef ActivationArrayBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DATA = 4 + }; + const flatbuffers::Vector> *data() const { + return GetPointer> *>(VT_DATA); + } + flatbuffers::Vector> *mutable_data() { + return GetPointer> *>(VT_DATA); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_DATA) && + verifier.VerifyVector(data()) && + verifier.VerifyVectorOfTables(data()) && + verifier.EndTable(); + } +}; + +struct ActivationArrayBuilder { + typedef ActivationArray Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_data(flatbuffers::Offset>> data) { + fbb_.AddOffset(ActivationArray::VT_DATA, data); + } + explicit ActivationArrayBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ActivationArrayBuilder &operator=(const ActivationArrayBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateActivationArray( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset>> data = 0) { + ActivationArrayBuilder builder_(_fbb); + builder_.add_data(data); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateActivationArrayDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const std::vector> *data = nullptr) { + auto data__ = data ? _fbb.CreateVector>(*data) : 0; + return dml::ir::operatorFieldTypes::CreateActivationArray( + _fbb, + data__); +} + +struct UIntArray FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef UIntArrayBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DATA = 4 + }; + const flatbuffers::Vector *data() const { + return GetPointer *>(VT_DATA); + } + flatbuffers::Vector *mutable_data() { + return GetPointer *>(VT_DATA); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_DATA) && + verifier.VerifyVector(data()) && + verifier.EndTable(); + } +}; + +struct UIntArrayBuilder { + typedef UIntArray Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_data(flatbuffers::Offset> data) { + fbb_.AddOffset(UIntArray::VT_DATA, data); + } + explicit UIntArrayBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + UIntArrayBuilder &operator=(const UIntArrayBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateUIntArray( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset> data = 0) { + UIntArrayBuilder builder_(_fbb); + builder_.add_data(data); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateUIntArrayDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *data = nullptr) { + auto data__ = data ? 
_fbb.CreateVector(*data) : 0; + return dml::ir::operatorFieldTypes::CreateUIntArray( + _fbb, + data__); +} + +struct IntArray FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef IntArrayBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DATA = 4 + }; + const flatbuffers::Vector *data() const { + return GetPointer *>(VT_DATA); + } + flatbuffers::Vector *mutable_data() { + return GetPointer *>(VT_DATA); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_DATA) && + verifier.VerifyVector(data()) && + verifier.EndTable(); + } +}; + +struct IntArrayBuilder { + typedef IntArray Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_data(flatbuffers::Offset> data) { + fbb_.AddOffset(IntArray::VT_DATA, data); + } + explicit IntArrayBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + IntArrayBuilder &operator=(const IntArrayBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateIntArray( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset> data = 0) { + IntArrayBuilder builder_(_fbb); + builder_.add_data(data); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateIntArrayDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *data = nullptr) { + auto data__ = data ? _fbb.CreateVector(*data) : 0; + return dml::ir::operatorFieldTypes::CreateIntArray( + _fbb, + data__); +} + +struct FloatArray FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef FloatArrayBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DATA = 4 + }; + const flatbuffers::Vector *data() const { + return GetPointer *>(VT_DATA); + } + flatbuffers::Vector *mutable_data() { + return GetPointer *>(VT_DATA); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_DATA) && + verifier.VerifyVector(data()) && + verifier.EndTable(); + } +}; + +struct FloatArrayBuilder { + typedef FloatArray Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_data(flatbuffers::Offset> data) { + fbb_.AddOffset(FloatArray::VT_DATA, data); + } + explicit FloatArrayBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + FloatArrayBuilder &operator=(const FloatArrayBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateFloatArray( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset> data = 0) { + FloatArrayBuilder builder_(_fbb); + builder_.add_data(data); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateFloatArrayDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *data = nullptr) { + auto data__ = data ? 
_fbb.CreateVector(*data) : 0; + return dml::ir::operatorFieldTypes::CreateFloatArray( + _fbb, + data__); +} + +struct ScalarUnionData FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef ScalarUnionDataBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DATA_TYPE = 4, + VT_DATA = 6 + }; + dml::ir::operatorFieldTypes::ScalarVariant data_type() const { + return static_cast(GetField(VT_DATA_TYPE, 0)); + } + const void *data() const { + return GetPointer(VT_DATA); + } + template const T *data_as() const; + const dml::ir::operatorFieldTypes::ByteArray *data_as_ByteArray() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_ByteArray ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::Int8 *data_as_Int8() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_Int8 ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::UInt8 *data_as_UInt8() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_UInt8 ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::Int16 *data_as_Int16() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_Int16 ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::UInt16 *data_as_UInt16() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_UInt16 ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::Int32 *data_as_Int32() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_Int32 ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::UInt32 *data_as_UInt32() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_UInt32 ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::Int64 *data_as_Int64() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_Int64 ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::UInt64 *data_as_UInt64() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_UInt64 ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::Float32 *data_as_Float32() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_Float32 ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::Float64 *data_as_Float64() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_Float64 ? 
static_cast(data()) : nullptr; + } + void *mutable_data() { + return GetPointer(VT_DATA); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_DATA_TYPE) && + VerifyOffset(verifier, VT_DATA) && + VerifyScalarVariant(verifier, data(), data_type()) && + verifier.EndTable(); + } +}; + +template<> inline const dml::ir::operatorFieldTypes::ByteArray *ScalarUnionData::data_as() const { + return data_as_ByteArray(); +} + +template<> inline const dml::ir::operatorFieldTypes::Int8 *ScalarUnionData::data_as() const { + return data_as_Int8(); +} + +template<> inline const dml::ir::operatorFieldTypes::UInt8 *ScalarUnionData::data_as() const { + return data_as_UInt8(); +} + +template<> inline const dml::ir::operatorFieldTypes::Int16 *ScalarUnionData::data_as() const { + return data_as_Int16(); +} + +template<> inline const dml::ir::operatorFieldTypes::UInt16 *ScalarUnionData::data_as() const { + return data_as_UInt16(); +} + +template<> inline const dml::ir::operatorFieldTypes::Int32 *ScalarUnionData::data_as() const { + return data_as_Int32(); +} + +template<> inline const dml::ir::operatorFieldTypes::UInt32 *ScalarUnionData::data_as() const { + return data_as_UInt32(); +} + +template<> inline const dml::ir::operatorFieldTypes::Int64 *ScalarUnionData::data_as() const { + return data_as_Int64(); +} + +template<> inline const dml::ir::operatorFieldTypes::UInt64 *ScalarUnionData::data_as() const { + return data_as_UInt64(); +} + +template<> inline const dml::ir::operatorFieldTypes::Float32 *ScalarUnionData::data_as() const { + return data_as_Float32(); +} + +template<> inline const dml::ir::operatorFieldTypes::Float64 *ScalarUnionData::data_as() const { + return data_as_Float64(); +} + +struct ScalarUnionDataBuilder { + typedef ScalarUnionData Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_data_type(dml::ir::operatorFieldTypes::ScalarVariant data_type) { + fbb_.AddElement(ScalarUnionData::VT_DATA_TYPE, static_cast(data_type), 0); + } + void add_data(flatbuffers::Offset data) { + fbb_.AddOffset(ScalarUnionData::VT_DATA, data); + } + explicit ScalarUnionDataBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ScalarUnionDataBuilder &operator=(const ScalarUnionDataBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateScalarUnionData( + flatbuffers::FlatBufferBuilder &_fbb, + dml::ir::operatorFieldTypes::ScalarVariant data_type = dml::ir::operatorFieldTypes::ScalarVariant_NONE, + flatbuffers::Offset data = 0) { + ScalarUnionDataBuilder builder_(_fbb); + builder_.add_data(data); + builder_.add_data_type(data_type); + return builder_.Finish(); +} + +inline bool VerifyAttributeFieldVariant(flatbuffers::Verifier &verifier, const void *obj, AttributeFieldVariant type) { + switch (type) { + case AttributeFieldVariant_NONE: { + return true; + } + case AttributeFieldVariant_Activation: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case AttributeFieldVariant_ActivationArray: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case AttributeFieldVariant_UInt32: { + return verifier.Verify(static_cast(obj), 0); + } + case AttributeFieldVariant_UInt64: { + return verifier.Verify(static_cast(obj), 0); + } + case AttributeFieldVariant_Int32: { + return 
verifier.Verify(static_cast(obj), 0); + } + case AttributeFieldVariant_Float32: { + return verifier.Verify(static_cast(obj), 0); + } + case AttributeFieldVariant_UIntArray: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case AttributeFieldVariant_IntArray: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case AttributeFieldVariant_FloatArray: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case AttributeFieldVariant_ScaleBias: { + return verifier.Verify(static_cast(obj), 0); + } + case AttributeFieldVariant_Size2D: { + return verifier.Verify(static_cast(obj), 0); + } + case AttributeFieldVariant_ScalarUnionData: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case AttributeFieldVariant_Bool: { + return verifier.Verify(static_cast(obj), 0); + } + default: return true; + } +} + +inline bool VerifyAttributeFieldVariantVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types) { + if (!values || !types) return !values && !types; + if (values->size() != types->size()) return false; + for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { + if (!VerifyAttributeFieldVariant( + verifier, values->Get(i), types->GetEnum(i))) { + return false; + } + } + return true; +} + +inline bool VerifyScalarVariant(flatbuffers::Verifier &verifier, const void *obj, ScalarVariant type) { + switch (type) { + case ScalarVariant_NONE: { + return true; + } + case ScalarVariant_ByteArray: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_Int8: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_UInt8: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_Int16: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_UInt16: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_Int32: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_UInt32: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_Int64: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_UInt64: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_Float32: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_Float64: { + return verifier.Verify(static_cast(obj), 0); + } + default: return true; + } +} + +inline bool VerifyScalarVariantVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types) { + if (!values || !types) return !values && !types; + if (values->size() != types->size()) return false; + for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { + if (!VerifyScalarVariant( + verifier, values->Get(i), types->GetEnum(i))) { + return false; + } + } + return true; +} + +} // namespace operatorFieldTypes +} // namespace ir +} // namespace dml + +#endif // FLATBUFFERS_GENERATED_OPERATORFIELDTYPES_DML_IR_OPERATORFIELDTYPES_H_ diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/SchemaHelpers.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/SchemaHelpers.h index 5285481485184..1bc694dfe90c2 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/SchemaHelpers.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/SchemaHelpers.h @@ -26,14 +26,14 @@ namespace 
SchemaHelpers return field; } - inline OperatorFieldTypes::OperatorDesc ToOperatorFieldType(const DML_OPERATOR_DESC* value) + inline OperatorFieldTypes::FusedActivationOperatorDesc ToOperatorFieldType(const DML_OPERATOR_DESC* value) { - return value ? OperatorFieldTypes::OperatorDesc(ConvertOperatorDesc(*value)) : std::nullopt; + return value ? OperatorFieldTypes::FusedActivationOperatorDesc(ConvertOperatorDesc(*value)) : std::nullopt; } - inline OperatorFieldTypes::OperatorDescArray ToOperatorFieldType(const DML_OPERATOR_DESC* values, uint32_t count) + inline OperatorFieldTypes::FusedActivationOperatorDescArray ToOperatorFieldType(const DML_OPERATOR_DESC* values, uint32_t count) { - OperatorFieldTypes::OperatorDescArray field; + OperatorFieldTypes::FusedActivationOperatorDescArray field; if (values && count != 0) { field.emplace(count); @@ -65,13 +65,17 @@ namespace SchemaHelpers return value; } + inline OperatorFieldTypes::Bool ToOperatorFieldType(bool value) + { + return value; + } + inline OperatorFieldTypes::UIntArray ToOperatorFieldType(const uint32_t* values, uint32_t count) { OperatorFieldTypes::UIntArray field; if (values && count != 0) { - field.emplace(count); - std::copy_n(values, count, field->begin()); + field.assign(values, values + count); } return field; } @@ -81,8 +85,7 @@ namespace SchemaHelpers OperatorFieldTypes::IntArray field; if (values && count != 0) { - field.emplace(count); - std::copy_n(values, count, field->begin()); + field.assign(values, values + count); } return field; } @@ -92,8 +95,7 @@ namespace SchemaHelpers OperatorFieldTypes::FloatArray field; if (values && count != 0) { - field.emplace(count); - std::copy_n(values, count, field->begin()); + field.assign(values, values + count); } return field; } @@ -237,7 +239,7 @@ namespace SchemaHelpers { DML_OPERATOR_DESC* desc = nullptr; - const auto& value = field.AsOperatorDesc(); + const auto& value = field.AsFusedActivationOperatorDesc(); if (value) { desc = allocator->template Allocate(); @@ -251,7 +253,7 @@ namespace SchemaHelpers { DML_OPERATOR_DESC* descs = nullptr; - const auto& values = field.AsOperatorDescArray(); + const auto& values = field.AsFusedActivationOperatorDescArray(); if (values) { descs = allocator->template Allocate(values->size()); @@ -288,16 +290,20 @@ namespace SchemaHelpers dst->Write(value); } break; + case DML_SCHEMA_FIELD_TYPE_BOOL: + { + // OperatorFieldTypes::Bool is a 'bool' (1 byte) but written as 'BOOL' in op descs (4 bytes). 
+ BOOL value = static_cast(field.AsBool()); + dst->Write(value); + } break; + case DML_SCHEMA_FIELD_TYPE_UINT_ARRAY: { uint32_t* arrayPtr = nullptr; const auto& values = field.AsUIntArray(); - if (values) - { - arrayPtr = allocator->template Allocate(values->size()); - std::copy(values->begin(), values->end(), arrayPtr); - } + arrayPtr = allocator->template Allocate(values.size()); + std::copy(values.begin(), values.end(), arrayPtr); dst->Write(arrayPtr); } break; @@ -307,11 +313,8 @@ namespace SchemaHelpers int32_t* arrayPtr = nullptr; const auto& values = field.AsIntArray(); - if (values) - { - arrayPtr = allocator->template Allocate(values->size()); - std::copy(values->begin(), values->end(), arrayPtr); - } + arrayPtr = allocator->template Allocate(values.size()); + std::copy(values.begin(), values.end(), arrayPtr); dst->Write(arrayPtr); } break; @@ -321,11 +324,8 @@ namespace SchemaHelpers float* arrayPtr = nullptr; const auto& values = field.AsFloatArray(); - if (values) - { - arrayPtr = allocator->template Allocate(values->size()); - std::copy(values->begin(), values->end(), arrayPtr); - } + arrayPtr = allocator->template Allocate(values.size()); + std::copy(values.begin(), values.end(), arrayPtr); dst->Write(arrayPtr); } break; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp index 2456b396de3f6..e6f008af5c23f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp @@ -33,10 +33,10 @@ namespace Dml::GraphDescBuilder #pragma warning(pop) static void RemoveUnconnectedNodes( - std::vector& graphNodes, - std::vector& graphInputEdges, - std::vector& graphIntermediateEdges, - std::vector& graphOutputEdges) + std::vector& graphNodes, + std::vector& graphInputEdges, + std::vector& graphIntermediateEdges, + std::vector& graphOutputEdges) { enum class NodeState { @@ -52,7 +52,7 @@ namespace Dml::GraphDescBuilder }; std::vector nodesData(graphNodes.size()); - for (const DML_INTERMEDIATE_GRAPH_EDGE_DESC& intermediateEdge : graphIntermediateEdges) + for (const DmlIntermediateSerializedGraphEdge& intermediateEdge : graphIntermediateEdges) { nodesData[intermediateEdge.ToNodeIndex].predecessorIndices.push_back(intermediateEdge.FromNodeIndex); } @@ -60,7 +60,7 @@ namespace Dml::GraphDescBuilder std::stack nodeIndicesToVisit; // Start from the outputs of the graph and traverse upwards - for (const DML_OUTPUT_GRAPH_EDGE_DESC& outputEdge : graphOutputEdges) + for (const DmlOutputSerializedGraphEdge& outputEdge : graphOutputEdges) { nodeIndicesToVisit.push(outputEdge.FromNodeIndex); } @@ -143,17 +143,44 @@ namespace Dml::GraphDescBuilder } } + + uint32_t SetAndGetDmlGraphNodeIndex( + const uint32_t operatorDmlGraphNodeIndex, + const std::string& nodeNamePrefix, + AbstractOperatorDesc& operatorDesc, + /*in_out*/std::unordered_map& operatorDmlGraphToDmlGraphNodeIndexMap, + /*in_out*/std::vector& dmlGraphNodes) + { + auto iter = operatorDmlGraphToDmlGraphNodeIndexMap.find(operatorDmlGraphNodeIndex); + if (iter != operatorDmlGraphToDmlGraphNodeIndexMap.end()) + { + return iter->second; + } + operatorDmlGraphToDmlGraphNodeIndexMap[operatorDmlGraphNodeIndex] = static_cast(dmlGraphNodes.size()); + dmlGraphNodes.push_back({operatorDesc, nodeNamePrefix + std::to_string(operatorDmlGraphNodeIndex)}); + return operatorDmlGraphToDmlGraphNodeIndexMap[operatorDmlGraphNodeIndex]; + } + 
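The SetAndGetDmlGraphNodeIndex helper added above lazily maps an operator-graph node index to a main DmlGraph node index: the first lookup appends a new node (named with the given prefix) to the DmlGraph node list and records the mapping, and every later lookup for the same index reuses the recorded value. A minimal, self-contained sketch of that memoization pattern follows; the Node type, GetOrAssignMainGraphIndex name, and main() driver are illustrative placeholders and not part of this patch.

```
// Standalone sketch of the lazy index-mapping pattern used by SetAndGetDmlGraphNodeIndex.
// Names and the Node type are illustrative stand-ins, not ONNX Runtime types.
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

struct Node { std::string name; };  // stand-in for DmlSerializedGraphNode

// Returns the main-graph index for an operator-graph node, appending the node
// to mainGraphNodes only the first time that operator-graph index is seen.
uint32_t GetOrAssignMainGraphIndex(
    uint32_t operatorGraphNodeIndex,
    const std::string& namePrefix,
    std::unordered_map<uint32_t, uint32_t>& indexMap,
    std::vector<Node>& mainGraphNodes)
{
    auto iter = indexMap.find(operatorGraphNodeIndex);
    if (iter != indexMap.end())
    {
        return iter->second;  // already registered; reuse the assigned index
    }
    const uint32_t newIndex = static_cast<uint32_t>(mainGraphNodes.size());
    indexMap[operatorGraphNodeIndex] = newIndex;
    mainGraphNodes.push_back({namePrefix + std::to_string(operatorGraphNodeIndex)});
    return newIndex;
}

int main()
{
    std::unordered_map<uint32_t, uint32_t> indexMap;
    std::vector<Node> mainGraphNodes;

    // Two distinct operator-graph nodes get indices 0 and 1; the repeated
    // lookup of node 5 returns the index assigned on first use.
    std::cout << GetOrAssignMainGraphIndex(5, "conv_", indexMap, mainGraphNodes) << "\n";  // 0
    std::cout << GetOrAssignMainGraphIndex(7, "conv_", indexMap, mainGraphNodes) << "\n";  // 1
    std::cout << GetOrAssignMainGraphIndex(5, "conv_", indexMap, mainGraphNodes) << "\n";  // 0
    return 0;
}
```

Keeping the map next to the node list guarantees the returned index always matches the node's position in that list, which is what lets edges from different operator graphs reference the merged node list consistently.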
+ // Terminology: + // Subgraph: the ONNX graph partition carved out of the original (main) ONNX graph + // DmlGraph: a graph in DML terms converted from a subgraph + // operatorDmlGraph: a graph in DML terms for a single node or operator + // Main points to note: + // - GraphDesc always has sequential indices for input and intermediate edges. + // - One ONNX node can be converted into one or more DML nodes. GraphDesc BuildGraphDesc( const uint8_t* isConstGpuGraphInput, const size_t isConstGpuGraphInputCount, const std::unordered_map>& isInitializerTransferable, const std::unordered_map& graphNodePropertyMap, - IDMLDevice* device, const ExecutionProviderImpl* executionHandle, const onnxruntime::Path& modelPath, gsl::span subgraphNodes, gsl::span subgraphInputs, - gsl::span subgraphOutputs) + gsl::span subgraphOutputs, + /*out*/ std::unordered_map& serializedGraphInputIndexToSubgraphInputIndex, + /*out*/ std::unordered_map& serializedGraphLargeConstantNameToSubgraphInputIndex, + /*out*/ std::vector>& smallConstantData) { struct NodeAndIndex { @@ -161,19 +188,34 @@ namespace Dml::GraphDescBuilder uint32_t targetIndex; // The index of the input/output on the node (e.g. 1 for the second input on a node) }; - // Map from Lotus node argument names to the new node and index where it will be produced - std::unordered_map nameToNodeAndIndexMap; - std::unordered_map nodeOutputShapes; - // Map from Lotus node argument names to input indices of the fused kernel node. - std::unordered_map nameToDmlFusedNodeInputIndex; + // Map from ORT subgraph input names to indices + std::unordered_map subgraphInputNameToIndexMap; + + // - Map from ORT node output names to the DmlGraph node and output index that produce them. + // - Once a given ORT node (or operator) is transformed into an operatorDmlGraph, + // the ORT node's output names become output edges of that operatorDmlGraph. + // - This map is populated for those output edges. + std::unordered_map dmlGraphNodeOutputNameToNodeAndIndexMap; + + // This map is used to re-index a subGraphInputIndex to a sequential input index + // for the DmlGraph + std::unordered_map subGraphInputIndexToDmlGraphInputIndex; + + // Iterate through each node and create a corresponding node in the new graph + // We can iterate the nodes in any order because the edge connectivity will take care of the topological order + std::unordered_map> inferredOutputShapes; + + std::vector dmlGraphNodes; + std::vector dmlGraphInputEdges; + std::vector dmlGraphIntermediateEdges; + std::vector dmlGraphOutputEdges; for (size_t inputIndex = 0; inputIndex < subgraphInputs.size(); ++inputIndex) { - const onnxruntime::NodeArg* graphInput = subgraphInputs[inputIndex]; - - if (!graphInput) + const onnxruntime::NodeArg* subgraphInput = subgraphInputs[inputIndex]; + if (!subgraphInput) { // This is a workaround for when node inputs get manipulated by transformers outside of our control, // which then causes them to have a different name. If that happens we can't figure out how to @@ -181,45 +223,21 @@ namespace Dml::GraphDescBuilder // just bail early. ORT_THROW_HR(E_UNEXPECTED); } - - nameToDmlFusedNodeInputIndex.emplace(graphInput->Name(), gsl::narrow_cast(inputIndex)); - } - - StackAllocator<1024> allocator; // Used for converting abstract operator descs into DML_OPERATOR_DESC - - std::vector graphNodes; - std::vector graphInputEdges; - std::vector graphIntermediateEdges; - std::vector graphOutputEdges; - - // Avoid using separate command lists for small graphs. 
This value can be reduced by tuning the - // flushing behavior of DmlCommandRecorder. Its current behavior is to assume that graphs contain - // enough GPU work to be worth flushing immediately. - const uint32_t minNodeCountToReuseCommandList = 5; - bool reuseCommandList = false; - - if (subgraphNodes.size() >= minNodeCountToReuseCommandList || executionHandle->IsMcdmDevice()) - { - reuseCommandList = true; + subgraphInputNameToIndexMap.emplace(subgraphInput->Name(), gsl::narrow_cast(inputIndex)); } auto constantCpuGraphInputGetter = [&isInitializerTransferable, &modelPath](const std::string& argName) { ComPtr tensorWrapper; - auto iter = isInitializerTransferable.find(argName); if (iter != isInitializerTransferable.end()) { // Using const_cast here is simpler than making surrounding code const correct. tensorWrapper = wil::MakeOrThrow(const_cast(iter->second.first), modelPath); } - return tensorWrapper; }; - // Iterate through each node and create a corresponding node in the new graph - // We can iterate the nodes in any order because the edge connectivity will take care of the topological order - std::unordered_map> inferredOutputShapes; for (const onnxruntime::Node* subgraphNode : subgraphNodes) { @@ -277,195 +295,206 @@ namespace Dml::GraphDescBuilder } EdgeShapes outputShapes; - DmlGraphNodeCreateInfo graphNodeCreateInfo; + DmlGraphNodeCreateInfo operatorDmlGraphCreateInfo; graphNodeProps.internalRegInfo->graphNodeFactoryRegistration->factory( node, constantCpuNodeInputGetter, executionHandle, &inputShapesOverrides, /*out*/ &outputShapes, - /*out*/ &graphNodeCreateInfo + /*out*/ &operatorDmlGraphCreateInfo ); ORT_THROW_HR_IF(E_UNEXPECTED, outputShapes.EdgeCount() != node.OutputDefs().size()); for (int i = 0; i < node.OutputDefs().size(); ++i) { inferredOutputShapes[node.OutputDefs()[i]->Name()] = outputShapes.GetShape(i); - } - - // Create a map between operatorGraphNodeIndex to mainGraphNodeIndex. - std::unordered_map operatorGraphNodeIndexToMainGraphNodeIndexMap; - uint32_t graphNodeCount = gsl::narrow_cast(graphNodes.size()); - const bool isNodeAsOpDesc = graphNodeCreateInfo.nodesAsOperatorDesc.size() > 0; - size_t firstOpDescGraphNodeIndex = graphNodes.size(); - - if (isNodeAsOpDesc) + } + + // Algorithm: + // 1. Create constant nodes by iterating through operatorDmlGraph's input edges and keep a map of it, + // because there would be an intermediate edge from the constantNode and source of the intermediate edge + // should come before the destination. + // 2. Again iterate through operatorDmlGraph's input edges to create mainGraph's input and intermediate edges. + // 3. Iterate through operatorDmlGraph's intermediate edges to create mainGraph's intermediate edges. + // 4. Iterate through operatorDmlGraph's output edges to populate outputEdgeNameToDmlGraphNodeAndIndex + // 5. While performing step 2, 3, and 4, insert operatorDmlGraphNode to the mainDmlGraphNode list. + + for (auto& operatorDmlGraphInputEdge : operatorDmlGraphCreateInfo.inputEdges) { - // Can't populate graphNodes vector at this point, because operatorDesc may get modified later. 
- for (uint32_t nodeIndex = 0; nodeIndex < graphNodeCreateInfo.nodeCount; nodeIndex++) + const onnxruntime::NodeArg* arg = node.InputDefs()[operatorDmlGraphInputEdge.GraphInputIndex]; + if (arg->Exists()) { - ORT_THROW_HR_IF(E_UNEXPECTED, !graphNodeCreateInfo.nodesAsOperatorDesc[nodeIndex]); - operatorGraphNodeIndexToMainGraphNodeIndexMap.emplace(nodeIndex, graphNodeCount++); - } + auto iter = subgraphInputNameToIndexMap.find(arg->Name()); + if (iter != subgraphInputNameToIndexMap.end() && + iter->second < isConstGpuGraphInputCount && + isConstGpuGraphInput[iter->second]) + { + DmlSerializedGraphNode constantNode = {}; + constantNode.Name = arg->Name(); + + // This is a highly inefficient approach to generating constant nodes. It duplicates constant data + // across the graph input as well as every consumer's unique constant node. However it is currently + // only used for small inputs. + auto& operatorDmlGraphInputNode = operatorDmlGraphCreateInfo.nodes[operatorDmlGraphInputEdge.ToNodeIndex]; + std::vector toNodeInputTensorDescs = operatorDmlGraphInputNode->GetInputTensors(); + DmlBufferTensorDesc* tensorDesc = toNodeInputTensorDescs[operatorDmlGraphInputEdge.ToNodeInputIndex]; + ComPtr constantInput; + + if (tensorDesc->totalTensorSizeInBytes < c_maxConstNodeDataSize) + { + constantInput = constantCpuGraphInputGetter(arg->Name()); + } - graphNodes.resize(graphNodes.size() + graphNodeCreateInfo.nodeCount); - } - else - { - for (uint32_t nodeIndex = 0; nodeIndex < graphNodeCreateInfo.nodeCount; nodeIndex++) - { - ORT_THROW_HR_IF(E_UNEXPECTED, !graphNodeCreateInfo.nodesAsIDMLOperator[nodeIndex].Get()); - operatorGraphNodeIndexToMainGraphNodeIndexMap.emplace(nodeIndex, graphNodeCount++); - NodeInfo nodeInfo = {}; - nodeInfo.nodeDef = std::move(graphNodeCreateInfo.nodesAsIDMLOperator[nodeIndex]); - graphNodes.push_back(std::move(nodeInfo)); + if (constantInput) + { + // The tensor description's size should be no larger than the constant input unless it was rounded to + // the required alignment. + assert(((constantInput->GetTensorByteSize() + 3) & ~3) >= tensorDesc->totalTensorSizeInBytes); + size_t minimumConstantSize = std::min(constantInput->GetTensorByteSize(), gsl::narrow_cast(tensorDesc->totalTensorSizeInBytes)); + auto data = static_cast(constantInput->GetData()); + std::vector tensorData(data, data + minimumConstantSize); + + smallConstantData.push_back(std::make_unique(tensorData.size())); + std::transform(tensorData.begin(), tensorData.end(), smallConstantData.back().get(), [](uint8_t b) {return static_cast(b);}); + + ConstantData constantData = {smallConstantData.back().get(), tensorData.size()}; + constantNode.Desc = constantData; + } + else + { + ConstantName constantFileName = {GetSanitizedFileName(arg->Name())}; + constantNode.Desc = constantFileName; + } + dmlGraphNodeOutputNameToNodeAndIndexMap[arg->Name()] = {static_cast(dmlGraphNodes.size()), 0}; + dmlGraphNodes.push_back(constantNode); + } } } - // map operatorGraphInputEdge as either mainGraphInputEdge or mainGraphIntermediateEdge - for (auto& operatorGraphInputEdge : graphNodeCreateInfo.inputEdges) - { - // operatorGraphInputEdge.GraphInputIndex will be the ONNX input index. - const onnxruntime::NodeArg* arg = node.InputDefs()[operatorGraphInputEdge.GraphInputIndex]; + // Create a map between operatorGraphNodeIndex to dmlGraphNodeIndex. 
+ std::unordered_map operatorDmlGraphToDmlGraphNodeIndexMap; + // map operatorDmlGraphInputEdge as either mainDmlGraphInputEdge or mainDmlGraphIntermediateEdge + for (auto& operatorDmlGraphInputEdge : operatorDmlGraphCreateInfo.inputEdges) + { + // operatorDmlGraphInputEdge.GraphInputIndex will be the ONNX input index. + const onnxruntime::NodeArg* arg = node.InputDefs()[operatorDmlGraphInputEdge.GraphInputIndex]; if (arg->Exists()) { - auto iter = nameToDmlFusedNodeInputIndex.find(arg->Name()); - uint32_t mainGraphNodeIndex = operatorGraphNodeIndexToMainGraphNodeIndexMap[operatorGraphInputEdge.ToNodeIndex]; - - if (iter != nameToDmlFusedNodeInputIndex.end()) + uint32_t dmlGraphNodeIndex = SetAndGetDmlGraphNodeIndex( + operatorDmlGraphInputEdge.ToNodeIndex, + node.Name(), + *operatorDmlGraphCreateInfo.nodes[operatorDmlGraphInputEdge.ToNodeIndex], + operatorDmlGraphToDmlGraphNodeIndexMap, + dmlGraphNodes); + + auto iter = subgraphInputNameToIndexMap.find(arg->Name()); + if (iter != subgraphInputNameToIndexMap.end()) { - // This is a graph input - - const uint32_t dmlFusedNodeInputIndex = iter->second; - - // If this is a constant input, set the appropriate flags on the desc - if (isNodeAsOpDesc && - dmlFusedNodeInputIndex < isConstGpuGraphInputCount && - isConstGpuGraphInput[dmlFusedNodeInputIndex]) + const uint32_t subgraphInputIndex = iter->second; + + // Either this edge will be + // a constant input, then it will be an intermediate edge and + // set the OWNED_BY_DML flag if it is large constant + // or, + // a non-constant input, then it will be a mainDmlGraphInputEdge. + if (subgraphInputIndex < isConstGpuGraphInputCount && + isConstGpuGraphInput[subgraphInputIndex]) { - // This is a highly inefficient approach to generating constant nodes. It duplicates constant data - // across the graph input as well as every consumer's unique constant node. However it is currently - // only used for small inputs. - uint32_t c_maxConstNodeDataSize = 8; - - - auto& operatorGraphInputNode = graphNodeCreateInfo.nodesAsOperatorDesc[operatorGraphInputEdge.ToNodeIndex]; - std::vector toNodeInputTensorDescs = operatorGraphInputNode->GetInputTensors(); - DmlBufferTensorDesc* tensorDesc = toNodeInputTensorDescs[operatorGraphInputEdge.ToNodeInputIndex]; - ComPtr constantInput; - - if (tensorDesc->totalTensorSizeInBytes < c_maxConstNodeDataSize) - { - constantInput = constantCpuGraphInputGetter(arg->Name()); - } - - if (constantInput) - { - // The tensor description's size should be no larger than the constant input unless it was rounded to - // the required alignment. 
- assert(((constantInput->GetTensorByteSize() + 3) & ~3) >= tensorDesc->totalTensorSizeInBytes); - size_t minimumConstantSize = std::min(constantInput->GetTensorByteSize(), gsl::narrow_cast(tensorDesc->totalTensorSizeInBytes)); - auto data = static_cast(constantInput->GetData()); - std::vector tensorData(data, data + minimumConstantSize); - - NodeInfo nodeInfo = {}; - nodeInfo.nodeDef = std::move(tensorData); - graphNodes.push_back(std::move(nodeInfo)); - - DML_INTERMEDIATE_GRAPH_EDGE_DESC edge = {}; - edge.FromNodeIndex = static_cast(graphNodes.size() - 1); - edge.FromNodeOutputIndex = 0; - edge.ToNodeIndex = mainGraphNodeIndex; - edge.ToNodeInputIndex = operatorGraphInputEdge.ToNodeInputIndex; - graphIntermediateEdges.push_back(edge); - } - else + const auto& constantNodeAndIndex = dmlGraphNodeOutputNameToNodeAndIndexMap.at(arg->Name()); + auto& constantNodeVariant = std::get(dmlGraphNodes[constantNodeAndIndex.nodeIndex].Desc); + if (std::holds_alternative(constantNodeVariant)) { - DML_INPUT_GRAPH_EDGE_DESC edge = {}; - edge.GraphInputIndex = dmlFusedNodeInputIndex; - edge.ToNodeIndex = mainGraphNodeIndex; - edge.ToNodeInputIndex = operatorGraphInputEdge.ToNodeInputIndex; - graphInputEdges.push_back(edge); - + auto& mainDmlGraphNode = dmlGraphNodes[dmlGraphNodeIndex]; + AbstractOperatorDesc& abstractOperatorDesc = std::get(mainDmlGraphNode.Desc); + std::vector toNodeInputTensorDescs = abstractOperatorDesc.GetInputTensors(); + DmlBufferTensorDesc* tensorDesc = toNodeInputTensorDescs[operatorDmlGraphInputEdge.ToNodeInputIndex]; tensorDesc->flags |= DML_TENSOR_FLAG_OWNED_BY_DML; + serializedGraphLargeConstantNameToSubgraphInputIndex[arg->Name()] = subgraphInputIndex; } + + DmlIntermediateSerializedGraphEdge edge = {}; + edge.FromNodeIndex = constantNodeAndIndex.nodeIndex; + edge.FromNodeOutputIndex = constantNodeAndIndex.targetIndex; + edge.ToNodeIndex = dmlGraphNodeIndex; + edge.ToNodeInputIndex = operatorDmlGraphInputEdge.ToNodeInputIndex; + edge.Name = arg->Name() + "-nodeIdx:" + std::to_string(edge.FromNodeIndex) + "-outputIdx:" + std::to_string(edge.FromNodeOutputIndex); + dmlGraphIntermediateEdges.push_back(edge); } else { - DML_INPUT_GRAPH_EDGE_DESC edge = {}; - edge.GraphInputIndex = dmlFusedNodeInputIndex; - edge.ToNodeIndex = mainGraphNodeIndex; - edge.ToNodeInputIndex = operatorGraphInputEdge.ToNodeInputIndex; - graphInputEdges.push_back(edge); + DmlInputSerializedGraphEdge edge = {}; + if (subGraphInputIndexToDmlGraphInputIndex.find(subgraphInputIndex) == subGraphInputIndexToDmlGraphInputIndex.end()) + { + subGraphInputIndexToDmlGraphInputIndex[subgraphInputIndex] = static_cast(subGraphInputIndexToDmlGraphInputIndex.size()); + } + + edge.GraphInputIndex = subGraphInputIndexToDmlGraphInputIndex[subgraphInputIndex]; + edge.ToNodeIndex = dmlGraphNodeIndex; + edge.ToNodeInputIndex = operatorDmlGraphInputEdge.ToNodeInputIndex; // ?? 
might need to point inputIndex + edge.Name = arg->Name(); + + serializedGraphInputIndexToSubgraphInputIndex[edge.GraphInputIndex] = subgraphInputIndex; + dmlGraphInputEdges.push_back(edge); } } else { - const auto& inputNodeAndIndex = nameToNodeAndIndexMap.at(arg->Name()); + const auto& inputNodeAndIndex = dmlGraphNodeOutputNameToNodeAndIndexMap.at(arg->Name()); - DML_INTERMEDIATE_GRAPH_EDGE_DESC edge = {}; + DmlIntermediateSerializedGraphEdge edge = {}; edge.FromNodeIndex = inputNodeAndIndex.nodeIndex; edge.FromNodeOutputIndex = inputNodeAndIndex.targetIndex; - edge.ToNodeIndex = mainGraphNodeIndex; - edge.ToNodeInputIndex = operatorGraphInputEdge.ToNodeInputIndex; - graphIntermediateEdges.push_back(edge); + edge.ToNodeIndex = dmlGraphNodeIndex; + edge.ToNodeInputIndex = operatorDmlGraphInputEdge.ToNodeInputIndex; + edge.Name = arg->Name(); + dmlGraphIntermediateEdges.push_back(edge); } } } // map operatorGraphIntermediateEdges as mainGraphIntermediateEdge - for (auto& operatorGraphIntermediateEdge : graphNodeCreateInfo.intermediateEdges) + for (auto& operatorGraphIntermediateEdge : operatorDmlGraphCreateInfo.intermediateEdges) { - DML_INTERMEDIATE_GRAPH_EDGE_DESC edge = {}; - edge.FromNodeIndex = operatorGraphNodeIndexToMainGraphNodeIndexMap[operatorGraphIntermediateEdge.FromNodeIndex]; + DmlIntermediateSerializedGraphEdge edge = {}; + uint32_t shiftedFromNodeIndex = SetAndGetDmlGraphNodeIndex( + operatorGraphIntermediateEdge.FromNodeIndex, + node.Name(), + *operatorDmlGraphCreateInfo.nodes[operatorGraphIntermediateEdge.FromNodeIndex], + operatorDmlGraphToDmlGraphNodeIndexMap, + dmlGraphNodes); + uint32_t shiftedToNodeIndex = SetAndGetDmlGraphNodeIndex( + operatorGraphIntermediateEdge.ToNodeIndex, + node.Name(), + *operatorDmlGraphCreateInfo.nodes[operatorGraphIntermediateEdge.ToNodeIndex], + operatorDmlGraphToDmlGraphNodeIndexMap, + dmlGraphNodes); + + edge.FromNodeIndex = shiftedFromNodeIndex; edge.FromNodeOutputIndex = operatorGraphIntermediateEdge.FromNodeOutputIndex; - edge.ToNodeIndex = operatorGraphNodeIndexToMainGraphNodeIndexMap[operatorGraphIntermediateEdge.ToNodeIndex]; + edge.ToNodeIndex = shiftedToNodeIndex; edge.ToNodeInputIndex = operatorGraphIntermediateEdge.ToNodeInputIndex; - graphIntermediateEdges.push_back(edge); + edge.Name = "nodeIdx:" + std::to_string(shiftedFromNodeIndex) + "-outputIdx:" + std::to_string(operatorGraphIntermediateEdge.FromNodeOutputIndex); + dmlGraphIntermediateEdges.push_back(edge); } - + // populate nameToNodeAndIndexMap (which will be used by above loop) for operatorGraphOutputEdges - for (auto& operatorGraphOutputEdge : graphNodeCreateInfo.outputEdges) + for (auto& operatorGraphOutputEdge : operatorDmlGraphCreateInfo.outputEdges) { const onnxruntime::NodeArg* arg = node.OutputDefs()[operatorGraphOutputEdge.GraphOutputIndex]; if (arg->Exists()) { - nameToNodeAndIndexMap[arg->Name()] = NodeAndIndex { - operatorGraphNodeIndexToMainGraphNodeIndexMap[operatorGraphOutputEdge.FromNodeIndex], - operatorGraphOutputEdge.FromNodeOutputIndex - }; - + uint32_t shiftedNodeIndex = SetAndGetDmlGraphNodeIndex( + operatorGraphOutputEdge.FromNodeIndex, + node.Name(), + *operatorDmlGraphCreateInfo.nodes[operatorGraphOutputEdge.FromNodeIndex], + operatorDmlGraphToDmlGraphNodeIndexMap, + dmlGraphNodes); + dmlGraphNodeOutputNameToNodeAndIndexMap[arg->Name()] = {shiftedNodeIndex, operatorGraphOutputEdge.FromNodeOutputIndex}; nodeOutputShapes[arg->Name()] = outputShapes; } } - - if (isNodeAsOpDesc) - { - for (size_t i = 0; i < 
graphNodeCreateInfo.nodesAsOperatorDesc.size(); ++i) - { - auto& opDesc = graphNodeCreateInfo.nodesAsOperatorDesc[i]; - - DML_OPERATOR_DESC dmlDesc = SchemaHelpers::ConvertOperatorDesc(*opDesc, &allocator); - - // TODO: Change as new header is ingested - if (dmlDesc.Type == (DML_OPERATOR_TYPE) DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING) - dmlDesc.Type = (DML_OPERATOR_TYPE) 169; - - // TODO: Change as new header is ingested - if (dmlDesc.Type == (DML_OPERATOR_TYPE) DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT) - dmlDesc.Type = (DML_OPERATOR_TYPE) 170; - - ComPtr op; - ORT_THROW_IF_FAILED(device->CreateOperator(&dmlDesc, IID_PPV_ARGS(&op))); - allocator.Reset(); - - NodeInfo nodeInfo = {}; - nodeInfo.nodeDef = std::move(op); - nodeInfo.name = node.Name(); - graphNodes[firstOpDescGraphNodeIndex + i] = std::move(nodeInfo); - } - } } EdgeShapes graphOutputShapes(subgraphOutputs.size()); @@ -476,24 +505,27 @@ namespace Dml::GraphDescBuilder const onnxruntime::NodeArg* graphOutput = subgraphOutputs[outputIndex]; ORT_THROW_HR_IF_NULL_MSG(E_POINTER, graphOutput, "FusedNode's nodeArgList does not contain one of the nodeArg"); - const auto& outputNodeAndIndex = nameToNodeAndIndexMap.at(graphOutput->Name()); + const auto& outputNodeAndIndex = dmlGraphNodeOutputNameToNodeAndIndexMap.at(graphOutput->Name()); - DML_OUTPUT_GRAPH_EDGE_DESC edge = {}; + DmlOutputSerializedGraphEdge edge = {}; edge.FromNodeIndex = outputNodeAndIndex.nodeIndex; edge.FromNodeOutputIndex = outputNodeAndIndex.targetIndex; edge.GraphOutputIndex = gsl::narrow_cast(outputIndex); - graphOutputEdges.push_back(edge); + edge.Name = graphOutput->Name(); + dmlGraphOutputEdges.push_back(edge); graphOutputShapes.GetMutableShape(outputIndex) = nodeOutputShapes[graphOutput->Name()].GetShape(outputNodeAndIndex.targetIndex); } - RemoveUnconnectedNodes(graphNodes, graphInputEdges, graphIntermediateEdges, graphOutputEdges); + RemoveUnconnectedNodes(dmlGraphNodes, dmlGraphInputEdges, dmlGraphIntermediateEdges, dmlGraphOutputEdges); GraphDesc graphDesc{}; - graphDesc.nodes = std::move(graphNodes); - graphDesc.inputEdges = std::move(graphInputEdges); - graphDesc.outputEdges = std::move(graphOutputEdges); - graphDesc.intermediateEdges = std::move(graphIntermediateEdges); - graphDesc.reuseCommandList = reuseCommandList; + graphDesc.InputCount = static_cast(dmlGraphInputEdges.size()); + graphDesc.OutputCount = static_cast(subgraphOutputs.size()); + graphDesc.Nodes = std::move(dmlGraphNodes); + graphDesc.InputEdges = std::move(dmlGraphInputEdges); + graphDesc.OutputEdges = std::move(dmlGraphOutputEdges); + graphDesc.IntermediateEdges = std::move(dmlGraphIntermediateEdges); + graphDesc.reuseCommandList = (subgraphNodes.size() >= minNodeCountToReuseCommandList || executionHandle->IsMcdmDevice()); graphDesc.outputShapes = std::move(graphOutputShapes); return graphDesc; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.h index c95e89b45541b..4055984b40405 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.h @@ -22,22 +22,15 @@ namespace Dml namespace GraphDescBuilder { + constexpr uint32_t minNodeCountToReuseCommandList = 5; + constexpr uint32_t c_maxConstNodeDataSize = 8; + // Gets a unique name for the node which survives recreation and graph manipulations between the point // that graph partitioning occurs and kernel 
creation happens const std::string& GetUniqueNodeName(const onnxruntime::Node& node); - struct NodeInfo - { - std::variant, std::vector> nodeDef; - std::string name; - }; - - struct GraphDesc + struct GraphDesc : DmlSerializedGraphDesc { - std::vector nodes; - std::vector inputEdges; - std::vector outputEdges; - std::vector intermediateEdges; bool reuseCommandList; Windows::AI::MachineLearning::Adapter::EdgeShapes outputShapes; }; @@ -47,11 +40,13 @@ namespace Dml const size_t isConstGpuGraphInputCount, const std::unordered_map>& isInitializerTransferable, const std::unordered_map& graphNodePropertyMap, - IDMLDevice* device, const ExecutionProviderImpl* executionHandle, const onnxruntime::Path& modelPath, gsl::span subgraphNodes, gsl::span subgraphInputs, - gsl::span subgraphOutputs); + gsl::span subgraphOutputs, + /*out*/ std::unordered_map& serializedGraphInputIndexToSubgraphInputIndex, + /*out*/ std::unordered_map& serializedGraphLargeConstantNameToSubgraphInputIndex, + /*out*/ std::vector>& smallConstantData); } } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index d524780de71b8..f29fbc7a1a65b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -1508,31 +1508,17 @@ namespace Windows::AI::MachineLearning::Adapter ORT_TRY { assert(operatorGraphDesc != nullptr); - // Either nodesAsOpDesc or nodesIDMLOperator can be present. - assert(operatorGraphDesc->nodeCount == 0 || (!operatorGraphDesc->nodesAsOpDesc ^ !operatorGraphDesc->nodesAsIDMLOperator)); + assert(operatorGraphDesc->nodeCount == 0 || operatorGraphDesc->nodes); - if (operatorGraphDesc->nodesAsOpDesc) + m_graphNodeCreateInfo->nodes = std::vector>(); + for (uint32_t nodeIndex = 0; nodeIndex < operatorGraphDesc->nodeCount; nodeIndex++) { - m_graphNodeCreateInfo->nodesAsOperatorDesc = std::vector>(); - for (uint32_t nodeIndex = 0; nodeIndex < operatorGraphDesc->nodeCount; nodeIndex++) - { - auto* node = operatorGraphDesc->nodesAsOpDesc[nodeIndex]; - assert(node != nullptr); - AbstractOperatorDesc abstractDesc = SchemaHelpers::ConvertOperatorDesc(*node); - m_graphNodeCreateInfo->nodesAsOperatorDesc.push_back(std::make_unique(std::move(abstractDesc))); - } - } - else - { - m_graphNodeCreateInfo->nodesAsIDMLOperator = std::vector>(); - for (uint32_t nodeIndex = 0; nodeIndex < operatorGraphDesc->nodeCount; nodeIndex++) - { - auto* node = operatorGraphDesc->nodesAsIDMLOperator[nodeIndex]; - assert(node != nullptr); - m_graphNodeCreateInfo->nodesAsIDMLOperator.push_back(node); - } + auto* node = operatorGraphDesc->nodes[nodeIndex]; + assert(node != nullptr); + AbstractOperatorDesc abstractDesc = SchemaHelpers::ConvertOperatorDesc(*node); + m_graphNodeCreateInfo->nodes.push_back(std::make_unique(std::move(abstractDesc))); } - + // There can be operators (or kernels) which don't require any input. 
assert(operatorGraphDesc->inputEdgeCount == 0 || operatorGraphDesc->inputEdges != nullptr); m_graphNodeCreateInfo->inputEdges.insert( diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp index c3bb1a52210f5..287f1e5b6dfe7 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp @@ -53,7 +53,7 @@ namespace Dml MLOperatorGraphDesc operatorGraphDesc = {}; operatorGraphDesc.nodeCount = 1; const DML_OPERATOR_DESC* opDescs{&operatorDesc}; - operatorGraphDesc.nodesAsOpDesc = &opDescs; + operatorGraphDesc.nodes = &opDescs; std::vector inputEdges; for (uint32_t inputIndex = 0; inputIndex < m_kernelInputIndices.size(); inputIndex++) @@ -796,7 +796,7 @@ namespace Dml for (size_t i = 0; i < graphDesc.NodeCount; ++i) { // Create the operator. - ORT_THROW_IF_FAILED(m_dmlDevice->CreateOperator(operatorGraphDesc.nodesAsOpDesc[i], IID_PPV_ARGS(&dmlOperators[i]))); + ORT_THROW_IF_FAILED(m_dmlDevice->CreateOperator(operatorGraphDesc.nodes[i], IID_PPV_ARGS(&dmlOperators[i]))); dmlOperatorGraphNodes[i] = DML_OPERATOR_GRAPH_NODE_DESC{dmlOperators[i].Get()}; dmlGraphNodes[i] = DML_GRAPH_NODE_DESC{DML_GRAPH_NODE_TYPE_OPERATOR, &dmlOperatorGraphNodes[i]}; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorAttention.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorAttention.cpp index c8ca6806e75f7..73c2d57e984af 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorAttention.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorAttention.cpp @@ -531,7 +531,7 @@ class DmlOperatorAttention : public DmlOperator operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size()); operatorGraphDesc.outputEdges = outputEdges.data(); operatorGraphDesc.nodeCount = gsl::narrow_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelCreationContext); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorBiasAdd.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorBiasAdd.cpp index 1c851c94c4ddc..5aceebbdabfe3 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorBiasAdd.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorBiasAdd.cpp @@ -103,7 +103,7 @@ class DmlOperatorBiasAdd : public DmlOperator operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size()); operatorGraphDesc.outputEdges = outputEdges.data(); operatorGraphDesc.nodeCount = gsl::narrow_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelCreationContext); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorBiasSplitGelu.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorBiasSplitGelu.cpp index 501ce14f1fc08..1e10214ffd463 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorBiasSplitGelu.cpp +++ 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorBiasSplitGelu.cpp @@ -137,7 +137,7 @@ class DmlOperatorBiasSplitGelu : public DmlOperator operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size()); operatorGraphDesc.outputEdges = outputEdges.data(); operatorGraphDesc.nodeCount = gsl::narrow_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelCreationContext); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorEmbedLayerNormalization.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorEmbedLayerNormalization.cpp index 6a8333cd72561..3c9458658c4d0 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorEmbedLayerNormalization.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorEmbedLayerNormalization.cpp @@ -484,7 +484,7 @@ class DmlOperatorEmbedLayerNormalization : public DmlOperator operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size()); operatorGraphDesc.outputEdges = outputEdges.data(); operatorGraphDesc.nodeCount = gsl::narrow_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelCreationContext); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorGroupNorm.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorGroupNorm.cpp index fed0e4645ffd8..8b275fc550f3e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorGroupNorm.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorGroupNorm.cpp @@ -287,7 +287,7 @@ class DmlOperatorGroupNorm : public DmlOperator operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size()); operatorGraphDesc.outputEdges = outputEdges.data(); operatorGraphDesc.nodeCount = gsl::narrow_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelCreationContext); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorLayerNormalization.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorLayerNormalization.cpp index 5c64059f7caa9..80e6fefc2fb80 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorLayerNormalization.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorLayerNormalization.cpp @@ -247,7 +247,7 @@ class DmlOperatorLayerNormalization : public DmlOperator operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size()); operatorGraphDesc.outputEdges = outputEdges.data(); operatorGraphDesc.nodeCount = gsl::narrow_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelCreationContext); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearConcat.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearConcat.cpp index c97b03dc36b62..8727610ff3112 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearConcat.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearConcat.cpp @@ -166,7 +166,7 @@ class DmlOperatorQLinearConcat : public DmlOperator, public QLinearConcatHelper MLOperatorGraphDesc operatorGraphDesc = {}; operatorGraphDesc.nodeCount = static_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); uint32_t joinNodeIndex = operatorGraphDesc.nodeCount - 2; uint32_t quantizeNodeIndex = operatorGraphDesc.nodeCount - 1; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearSigmoid.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearSigmoid.cpp index 35f926d62c92a..f658e7c7da323 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearSigmoid.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearSigmoid.cpp @@ -113,7 +113,7 @@ class DmlOperatorQLinearSigmoid : public DmlOperator MLOperatorGraphDesc operatorGraphDesc = {}; operatorGraphDesc.nodeCount = 3; std::vector opDescs{&opDesc1, &opDesc2, &opDesc3}; - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); // set input edges std::pair nodeToNodeInputIndex[5] {{0, 0}, {0, 1}, {0, 2}, {2, 1}, {2, 2}}; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQuickGelu.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQuickGelu.cpp index 3683ab7b0b0b3..e62b7d707ba78 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQuickGelu.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQuickGelu.cpp @@ -123,7 +123,7 @@ class DmlOperatorQuickGelu : public DmlOperator operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size()); operatorGraphDesc.outputEdges = outputEdges.data(); operatorGraphDesc.nodeCount = gsl::narrow_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelCreationContext); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorRotaryEmbedding.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorRotaryEmbedding.cpp index 44004b5d77f70..0f15ebf342b3a 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorRotaryEmbedding.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorRotaryEmbedding.cpp @@ -441,7 +441,7 @@ class DmlOperatorRotaryEmbedding : public DmlOperator operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size()); operatorGraphDesc.outputEdges = outputEdges.data(); operatorGraphDesc.nodeCount = gsl::narrow_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelInfo); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorSkipLayerNormalization.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorSkipLayerNormalization.cpp index 4dafd78f21ea8..094c45a0e38e5 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorSkipLayerNormalization.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorSkipLayerNormalization.cpp @@ -198,7 +198,7 @@ class DmlOperatorSkipLayerNormalization : public DmlOperator operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size()); operatorGraphDesc.outputEdges = outputEdges.data(); operatorGraphDesc.nodeCount = gsl::narrow_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelCreationContext); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Utility.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Utility.h new file mode 100644 index 0000000000000..02166f992449e --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Utility.h @@ -0,0 +1,141 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include +#include +#include + + +namespace Dml +{ + static inline std::wstring ConvertToWString(std::string_view str) + { + std::wstring_convert,wchar_t> g_converterToUtf16; + return g_converterToUtf16.from_bytes(str.data()); + } + + static inline std::wstring GetModelName(const onnxruntime::Path& modelPath) + { + if (modelPath.GetComponents().empty()) + { + return L""; + } + + const onnxruntime::PathString& pathString = modelPath.GetComponents().back(); + size_t dotPosition = pathString.find_last_of('.'); + if (dotPosition == std::string::npos) + { + return L""; + } + + return pathString.substr(0, dotPosition); + } + + static inline std::wstring GetSanitizedFileName(std::wstring_view name) + { + std::wstring newName(name); + for (wchar_t& c : newName) + { + switch (c) + { + case '\\': + case '/': + case '\"': + case '|': + case '<': + case '>': + case ':': + case '?': + case '*': + c = '_'; + break; + } + } + return newName; + } + + static inline std::string GetSanitizedFileName(std::string_view name) + { + std::string newName(name); + for (char& c : newName) + { + switch (c) + { + case '\\': + case '/': + case '\"': + case '|': + case '<': + case '>': + case ':': + case '?': + case '*': + c = '_'; + break; + } + } + return newName; + } + + static inline void WriteToFile(std::wstring_view directoryName, std::wstring_view fileName, std::uint8_t* data, size_t dataSize) + { + std::wstring sanitizedFileName = GetSanitizedFileName(fileName); + std::filesystem::create_directory(directoryName); + std::wstring fullSanitizedFileName = std::wstring(directoryName) + + (directoryName.empty() ? L"" : L"/") + + sanitizedFileName; + std::ofstream file(fullSanitizedFileName, std::ios::binary); + if (!file.is_open()) + { + std::wstring_convert,wchar_t> g_converterToUtf16; + std::stringstream errorMessage; + errorMessage << "File named: " << g_converterToUtf16.to_bytes(fileName.data()) << " could not be opened\n"; + throw std::ios::failure(errorMessage.str()); + } + file.write(reinterpret_cast(data), dataSize); + } + +} + +namespace StringUtil +{ + struct NameAndIndex + { + const char* name; // Null terminated. + uint32_t index; + }; + + struct WideNameAndIndex + { + const wchar_t* name; // Null terminated. 
+ uint32_t index; + }; + + inline std::optional MapToIndex(std::string_view mode, gsl::span nameAndIndexList) + { + for (auto& nameAndIndex : nameAndIndexList) + { + if (strncmp(nameAndIndex.name, mode.data(), mode.size()) == 0) + { + return nameAndIndex.index; + } + } + + return {}; + } + + inline std::optional MapToIndex(std::wstring_view mode, gsl::span nameAndIndexList) + { + for (auto& nameAndIndex : nameAndIndexList) + { + if (wcsncmp(nameAndIndex.name, mode.data(), mode.size()) == 0) + { + return nameAndIndex.index; + } + } + + return {}; + } +} \ No newline at end of file diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h index 83737d2ba4848..332bf86685e8a 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include @@ -37,6 +39,7 @@ #include #include "External/D3DX12/d3dx12.h" #endif +#include "flatbuffers/flatbuffers.h" #include "GraphicsUnknownHelper.h" @@ -53,6 +56,9 @@ #include "External/DirectMLHelpers/SchemaHelpers.h" #include "External/DirectMLHelpers/GeneratedSchemaHelpers.h" #include "External/DirectMLHelpers/DirectMLX.h" +#include "External/DirectMLHelpers/DmlSerializedGraphDesc.h" +#include "External/DirectMLHelpers/DmlGraphSerialization.h" +#include "External/DirectMLHelpers/DmlGraphDeserialization.h" using Microsoft::WRL::ComPtr; @@ -67,3 +73,4 @@ using Microsoft::WRL::ComPtr; #include "TensorDesc.h" #include "DescriptorPool.h" #include "IExecutionProvider.h" +#include "Utility.h" \ No newline at end of file diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h index 3bec8d3864cba..ac3a3eb1268b8 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h @@ -10,18 +10,11 @@ struct DML_INPUT_GRAPH_EDGE_DESC; struct DML_OUTPUT_GRAPH_EDGE_DESC; struct DML_INTERMEDIATE_GRAPH_EDGE_DESC; -// Either nodesAsOpDesc or nodesAsIDMLOperator is present. -// 1) Operator kernels which implement operators using only a single DML operator will pass a DML_OPERATOR_DESC. -// These kernels pass DML_OPERATOR_DESC, because while building Dml graph (inside FusedGraphKernel.cpp) we can change the -// the flag of constant inputs to DML_TENSOR_FLAG_OWNED_BY_DML. -// 2) Operator kernels which implement operators using DMLX graph, they will pass IDMLOperator and won't be able -// to use DML_TENSOR_FLAG_OWNED_BY_DML. struct MLOperatorGraphDesc { uint32_t nodeCount; - _Field_size_opt_(nodeCount) const DML_OPERATOR_DESC** nodesAsOpDesc; - _Field_size_opt_(nodeCount) IDMLOperator** nodesAsIDMLOperator; - + _Field_size_opt_(nodeCount) const DML_OPERATOR_DESC** nodes; + uint32_t inputEdgeCount; _Field_size_(inputEdgeCount) const DML_INPUT_GRAPH_EDGE_DESC* inputEdges; diff --git a/onnxruntime/core/providers/dml/dml_session_options_config_keys.h b/onnxruntime/core/providers/dml/dml_session_options_config_keys.h index d11fa7516e713..5b5f371f51616 100644 --- a/onnxruntime/core/providers/dml/dml_session_options_config_keys.h +++ b/onnxruntime/core/providers/dml/dml_session_options_config_keys.h @@ -21,3 +21,4 @@ // "1": disabled (disallowed). Graph fusion will never be used. 
// The default value is "0" static const char* const kOrtSessionOptionsConfigDisableDmlGraphFusion = "ep.dml.disable_graph_fusion"; +static const char* const kOrtSessionOptionsConfigEnableGraphSerialization = "ep.dml.enable_graph_serialization"; diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index efd7db4ea7629..5fd66c459d382 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1725,10 +1725,17 @@ common::Status InferenceSession::Initialize() { // graph optimization level and is generally always applied. bool dml_graph_fusion_enabled = session_options_.optimized_model_filepath.empty() && session_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsConfigDisableDmlGraphFusion, "0") == "0"; + std::string dml_graph_serialization_enabled_config_val = session_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsConfigEnableGraphSerialization, "0"); + std::transform(dml_graph_serialization_enabled_config_val.begin(), + dml_graph_serialization_enabled_config_val.end(), + dml_graph_serialization_enabled_config_val.begin(), + [](char ch) { return std::tolower(ch); }); + bool dml_graph_serialization_enabled = dml_graph_serialization_enabled_config_val == "true"; if (dml_graph_fusion_enabled) { std::unique_ptr dmlGraphFusionTransformer = std::make_unique("DmlGraphFusionTransformer", - dmlExecutionProvider); + dmlExecutionProvider, + dml_graph_serialization_enabled); if (dmlGraphFusionTransformer == nullptr) { return Status(common::ONNXRUNTIME, common::FAIL, "DmlGraphFusionTransformer is nullptr"); } diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index 3874901f86387..7d4111e3b9c39 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -68,6 +68,7 @@ namespace perftest { "\t [DML only] [device_filter]: DML device filter, options: 'any', 'gpu', 'npu', \n" "\t [DML only] [disable_metacommands]: Options: 'true', 'false', \n" "\t [DML only] [enable_dynamic_graph_fusion]: Options: 'true', 'false', \n" + "\t [DML only] [enable_graph_serialization]: Options: 'true', 'false', \n" "\t [OpenVINO only] [device_type]: Overrides the accelerator hardware type and precision with these values at runtime.\n" "\t [OpenVINO only] [device_id]: Selects a particular hardware device for inference.\n" "\t [OpenVINO only] [enable_npu_fast_compile]: Optionally enabled to speeds up the model's compilation on NPU device targets.\n" diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 87506c7240578..1934314b8ce43 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -18,6 +18,7 @@ #ifdef USE_DML #include "core/providers/dml/dml_provider_factory.h" +#include "core/providers/dml/dml_session_options_config_keys.h" #endif #ifdef _WIN32 @@ -542,6 +543,15 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); "[ERROR] [DML] You have selcted wrong value for the key 'enable_dynamic_graph_fusion'. 
" "Select from 'true' or 'false' \n"); } + } else if (key == "enable_graph_serialization") { + std::set ov_supported_values = {"true", "True", "false", "False"}; + if (ov_supported_values.find(value) != ov_supported_values.end()) { + session_options.AddConfigEntry(kOrtSessionOptionsConfigEnableGraphSerialization, value.data()); + } else { + ORT_THROW( + "[ERROR] [DML] You have selcted wrong value for the key 'enable_graph_serialization'. " + "Select from 'true' or 'false' \n"); + } } } session_options.AppendExecutionProvider("DML", dml_options); From 8bd943be39301639e3f50f524f8fd71c7f2b2a34 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Tue, 27 Feb 2024 09:31:32 +1000 Subject: [PATCH 062/279] Retry flaky XCode iOS UI tests if we get a known error (#19639) ### Description Xcode UI tests seem to be flaky: https://github.com/orgs/community/discussions/68807 Add a couple of retries if we get a "Timed out while loading Accessibility." error which is transient. ### Motivation and Context --- .../github/apple/test_apple_packages.py | 61 ++++++++++++++----- 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/tools/ci_build/github/apple/test_apple_packages.py b/tools/ci_build/github/apple/test_apple_packages.py index cd360a63a3a0f..3c0df994ffd3d 100644 --- a/tools/ci_build/github/apple/test_apple_packages.py +++ b/tools/ci_build/github/apple/test_apple_packages.py @@ -130,22 +130,51 @@ def _test_apple_packages(args): simulator_device_info = json.loads(simulator_device_info) - subprocess.run( - [ - "xcrun", - "xcodebuild", - "test", - "-workspace", - "./apple_package_test.xcworkspace", - "-scheme", - "ios_package_test", - "-destination", - f"platform=iOS Simulator,id={simulator_device_info['device_udid']}", - ], - shell=False, - check=True, - cwd=target_proj_path, - ) + # Xcode UI tests seem to be flaky: https://github.com/orgs/community/discussions/68807 + # Add a couple of retries if we get this error: + # ios_package_testUITests-Runner Failed to initialize for UI testing: + # Error Domain=com.apple.dt.XCTest.XCTFuture Code=1000 "Timed out while loading Accessibility." + attempts = 0 + cmd = [ + "xcrun", + "xcodebuild", + "test", + "-workspace", + "./apple_package_test.xcworkspace", + "-scheme", + "ios_package_test", + "-destination", + f"platform=iOS Simulator,id={simulator_device_info['device_udid']}", + ] + + while True: + attempts += 1 + completed_process = subprocess.run( + cmd, + shell=False, + capture_output=True, + check=False, + text=True, + cwd=target_proj_path, + ) + + # print so it's in CI output + print(completed_process.stdout) + + if completed_process.returncode != 0: + print(f"Running ios_package_test failed. Return code was {completed_process.returncode}") + print("xcrun xcodebuild test stderr:") + print(completed_process.stderr) + print("---") + + if "Timed out while loading Accessibility" in completed_process.stderr and attempts < 3: + continue + + raise subprocess.CalledProcessError( + completed_process.returncode, " ".join(cmd), completed_process.stdout, completed_process.stderr + ) + + break if PackageVariant[args.variant] != PackageVariant.Mobile and not args.skip_macos_test: subprocess.run( From 18c8fab1ae03e68a906fe42698ac322e9e49e218 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Mon, 26 Feb 2024 15:58:09 -0800 Subject: [PATCH 063/279] Fix a bug in build.py (#19652) ### Description Fix a bug in build.py that accidentally disabled C# tests for most builds when "--build_nuget" is specified. ### Motivation and Context The bug was introduced in PR #8892 . 
--- tools/ci_build/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 5b715bb29e5a1..74c473d34f548 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -2592,7 +2592,7 @@ def main(): raise BuildError("Using --get-api-doc requires a single build config") # Disabling unit tests for GPU on nuget creation - if args.use_openvino != "CPU_FP32" and args.build_nuget: + if args.use_openvino and args.use_openvino != "CPU_FP32" and args.build_nuget: args.test = False # GDK builds don't support testing From 8a71b657654d63437267014b324bf124a80de347 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Tue, 27 Feb 2024 11:35:27 +1000 Subject: [PATCH 064/279] Remove skipping of Reshape from NNAPI EP (#19618) ### Description A number of Qualcomm Snapdragon chipsets do not produce correct output if we skip the Reshape, which ironically was a performance optimization for Snapdragon chips. Perf testing showed that Squeeze also seems to execute on CPU so there's no benefit to using that as an alternative where possible e.g. Global*Pool -> Reshape to 2D -> Gemm could be potentially be replaced with Global*Pool -> Squeeze dims 2 and 3 -> Gemm if that offered better performance. ### Motivation and Context #19518 --- .../builders/op_builder_helpers.cc | 30 ++++++++++++++----- .../builders/op_builder_helpers.h | 3 -- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.cc index a066c64dac67d..466865f23f49a 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.cc @@ -965,6 +965,18 @@ Status AddMinMaxOperator(ModelBuilder& model_builder, const NodeUnit& node_unit, return Status::OK(); } +// NOTE: Skipping Reshape results in invalid output on some SnapDragon chipsets. Whilst the NNAPI spec says the input +// to FullyConnnected can be > 2D, those chipsets don't handle this correctly. +// +// CanSkipReshape could potentially be re-enabled in the future if we no longer want to support those old chipsets. +// However, the Reshape of newer chipsets may not run on CPU so there may not be a performance issue to try and avoid, +// so CanSkipReshape could be redundant anyway. +// +// Known bad chipsets: Qualcomm Snapdragon 850, 855, 865, 870. +// +// See https://github.com/microsoft/onnxruntime/issues/19518 + +/* // We can skip the Reshape if all the output edges satisfies both the following conditions // 1. The output of the reshape/flatten is not an output of the graph // 2. The output of the reshape/flatten is the input 0 of one or more GEMM/Matmul operators, @@ -977,7 +989,7 @@ Status AddMinMaxOperator(ModelBuilder& model_builder, const NodeUnit& node_unit, // between NNAPI CPU impl and Hardware Accelerator impl and will speed up the execution // If we are going to skip the reshape, we will still add correct shape and operand type for the output in // onnxruntime::nnapi::Model. -bool CanSkipReshape(const ModelBuilder& model_builder, const NodeUnit& node_unit, +static bool CanSkipReshape(const ModelBuilder& model_builder, const NodeUnit& node_unit, size_t input_rank, size_t output_rank) { // Since we know this is a Reshape NodeUnit, so we can safely assume there is only 1 output // and the node_unit has only one output node. 
@@ -1039,33 +1051,37 @@ bool CanSkipReshape(const ModelBuilder& model_builder, const NodeUnit& node_unit << node_unit.Name() << "] with output, " << output_name; return true; } +*/ Status AddReshapeOperator(ModelBuilder& model_builder, const NodeUnit& node_unit, const std::string& input, const std::vector& shape) { auto& shaper(model_builder.GetShaper()); - const auto& operand_indices(model_builder.GetOperandIndices()); const auto& operand_types(model_builder.GetOperandTypes()); const auto& output = node_unit.Outputs()[0].node_arg.Name(); const auto input_shape = shaper[input]; const auto output_shape = shaper[output]; - const auto input_rank = input_shape.size(); - const auto output_rank = output_shape.size(); // For reshape, the output type should be the same as the input type except the shape is different auto output_operand_type = operand_types.at(input); output_operand_type.SetDimensions(output_shape); + /* See CanSkipReshape definition above for explanation of why this is disabled. // Since Reshape is not running using hardware in NNAPI for some CPU (e.g. Qualcomm SD for now) // We will try to see if we the skip the Reshape to prevent context switching between // NNAPI CPU impl and NNAPI hardware accelerator impl if (CanSkipReshape(model_builder, node_unit, input_rank, output_rank)) { - // Since reshape can be skipped, only register the dimension and type, with same index and new name + const auto& operand_indices(model_builder.GetOperandIndices()); + const auto input_rank = input_shape.size(); + const auto output_rank = output_shape.size(); + // Since reshape can be skipped, only register the dimension and type, with same index and new name. + // This essentially redirects the downstream operator builders to the input of the skipped Reshape node, + // but with the output shape of the Reshape node. model_builder.RegisterOperand(output, operand_indices.at(input), output_operand_type); - } else { - // We still need to perform a reshape here + } else */ + { std::string shape_name = model_builder.GetUniqueName(node_unit.Name() + input + "newshape"); ORT_RETURN_IF_ERROR(op_builder_helpers::AddNnapiReshape(model_builder, input, shape_name, shape, output)); } diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.h index 7ccf4c1ef7555..61a16ceff752f 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.h @@ -181,9 +181,6 @@ Status AddMinMaxOperator(ModelBuilder& model_builder, const NodeUnit& node_unit, Status AddReshapeOperator(ModelBuilder& model_builder, const NodeUnit& node_unit, const std::string& input, const std::vector& shape); -bool CanSkipReshape(const ModelBuilder& model_builder, const NodeUnit& node_unit, - size_t input_rank, size_t output_rank); - Status GetAxesForSqueezeAndUnSqueeze(ModelBuilder& model_builder, const NodeUnit& node_unit, std::vector& axes); From 6f566562cedff9996e55dbf623b1f0141733d52c Mon Sep 17 00:00:00 2001 From: kailums <109063327+kailums@users.noreply.github.com> Date: Tue, 27 Feb 2024 11:31:03 +0800 Subject: [PATCH 065/279] support user_compute_stream for rocm ep (#19619) ### Description According to the pr #19229 supporting cuda EP use external compute stream, we add support for rocm EP. 
And when we testing this feature with torch, we found torch use stream 0 for the default stream, and `torch.cuda.current_stream()` returns `0` for current stream, but ort treat `0` or `nullptr` as invalid, and reset has_user_compute_stream to false. Will remove has_user_compute_stream option in the future. ### Motivation and Context The motivation for this pr is that we want to use torch.cuda.graph to capture ort running kernel, which requires torch and ort are running in the same stream, so we use this API to set ort's working stream. --- .../rocm/rocm_execution_provider_info.cc | 20 +++++++++++++++++++ .../test/python/onnxruntime_test_python.py | 10 ++++++++++ 2 files changed, 30 insertions(+) diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.cc index b557f92287f2b..3cb826437a54f 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.cc @@ -13,6 +13,8 @@ namespace onnxruntime { namespace rocm { namespace provider_option_names { constexpr const char* kDeviceId = "device_id"; +constexpr const char* kHasUserComputeStream = "has_user_compute_stream"; +constexpr const char* kUserComputeStream = "user_compute_stream"; constexpr const char* kMemLimit = "gpu_mem_limit"; constexpr const char* kArenaExtendStrategy = "arena_extend_strategy"; constexpr const char* kMiopenConvExhaustiveSearch = "miopen_conv_exhaustive_search"; @@ -38,6 +40,7 @@ ROCMExecutionProviderInfo ROCMExecutionProviderInfo::FromProviderOptions(const P void* alloc = nullptr; void* free = nullptr; void* empty_cache = nullptr; + void* user_compute_stream = nullptr; ORT_THROW_IF_ERROR( ProviderOptionsParser{} .AddValueParser( @@ -52,6 +55,15 @@ ROCMExecutionProviderInfo ROCMExecutionProviderInfo::FromProviderOptions(const P ", must be between 0 (inclusive) and ", num_devices, " (exclusive)."); return Status::OK(); }) + .AddAssignmentToReference(rocm::provider_option_names::kHasUserComputeStream, info.has_user_compute_stream) + .AddValueParser( + rocm::provider_option_names::kUserComputeStream, + [&user_compute_stream](const std::string& value_str) -> Status { + size_t address; + ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, address)); + user_compute_stream = reinterpret_cast(address); + return Status::OK(); + }) .AddValueParser( rocm::provider_option_names::kGpuExternalAlloc, [&alloc](const std::string& value_str) -> Status { @@ -108,12 +120,18 @@ ROCMExecutionProviderInfo ROCMExecutionProviderInfo::FromProviderOptions(const P ROCMExecutionProviderExternalAllocatorInfo alloc_info{alloc, free, empty_cache}; info.external_allocator_info = alloc_info; + + info.user_compute_stream = user_compute_stream; + info.has_user_compute_stream = (user_compute_stream != nullptr); + return info; } ProviderOptions ROCMExecutionProviderInfo::ToProviderOptions(const ROCMExecutionProviderInfo& info) { const ProviderOptions options{ {rocm::provider_option_names::kDeviceId, MakeStringWithClassicLocale(info.device_id)}, + {rocm::provider_option_names::kHasUserComputeStream, MakeStringWithClassicLocale(info.has_user_compute_stream)}, + {rocm::provider_option_names::kUserComputeStream, MakeStringWithClassicLocale(reinterpret_cast(info.user_compute_stream))}, {rocm::provider_option_names::kMemLimit, MakeStringWithClassicLocale(info.gpu_mem_limit)}, {rocm::provider_option_names::kGpuExternalAlloc, 
MakeStringWithClassicLocale(reinterpret_cast(info.external_allocator_info.alloc))}, {rocm::provider_option_names::kGpuExternalFree, MakeStringWithClassicLocale(reinterpret_cast(info.external_allocator_info.free))}, @@ -135,6 +153,8 @@ ProviderOptions ROCMExecutionProviderInfo::ToProviderOptions(const ROCMExecution ProviderOptions ROCMExecutionProviderInfo::ToProviderOptions(const OrtROCMProviderOptions& info) { const ProviderOptions options{ {rocm::provider_option_names::kDeviceId, MakeStringWithClassicLocale(info.device_id)}, + {rocm::provider_option_names::kHasUserComputeStream, MakeStringWithClassicLocale(info.has_user_compute_stream)}, + {rocm::provider_option_names::kUserComputeStream, MakeStringWithClassicLocale(reinterpret_cast(info.user_compute_stream))}, {rocm::provider_option_names::kMemLimit, MakeStringWithClassicLocale(info.gpu_mem_limit)}, {rocm::provider_option_names::kArenaExtendStrategy, EnumToName(arena_extend_strategy_mapping, static_cast(info.arena_extend_strategy))}, {rocm::provider_option_names::kMiopenConvExhaustiveSearch, MakeStringWithClassicLocale(info.miopen_conv_exhaustive_search)}, diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index 91b6c71e735a8..ab56f3fa0f37f 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -559,6 +559,16 @@ def test_get_and_set_option_with_values(option_name, option_values): test_get_and_set_option_with_values("enable_hip_graph", ["1", "0"]) + # test for user_compute_stream + option = options["ROCMExecutionProvider"] + option["user_compute_stream"] = "1" + sess.set_providers(["ROCMExecutionProvider"], [option]) + new_options = sess.get_provider_options() + new_option = new_options["ROCMExecutionProvider"] + self.assertEqual(new_option["user_compute_stream"], "1") + # set user_compute_stream will set has_user_compute_stream to 1 too + self.assertEqual(new_option["has_user_compute_stream"], "1") + run_rocm_options_test() def test_invalid_set_providers(self): From 5bb58a10e739f8720e9867d19c4313081b12d948 Mon Sep 17 00:00:00 2001 From: Rachel Guo <35738743+YUNQIUGUO@users.noreply.github.com> Date: Mon, 26 Feb 2024 20:00:14 -0800 Subject: [PATCH 066/279] Enable the most verbose logging level in detox E2E React Native CI (#19659) ### Description The RN CI has intermittent failure error with "app seems to idle". enable the most verbose logging level (and can add steps to dump device.log from the detox folder/artifacts if necessary) to at least get more information. 
### Motivation and Context --------- Co-authored-by: rachguo --- .../github/azure-pipelines/templates/react-native-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml index 47cd72f412c67..1b7962059e301 100644 --- a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml @@ -279,7 +279,7 @@ stages: - script: | JEST_JUNIT_OUTPUT_FILE=$(Build.SourcesDirectory)/js/react_native/e2e/android-test-results.xml \ - detox test --record-logs all --configuration android.emu.release + detox test --record-logs all --configuration android.emu.release --loglevel trace workingDirectory: '$(Build.SourcesDirectory)/js/react_native/e2e' displayName: Run React Native Detox Android e2e Tests @@ -329,7 +329,7 @@ stages: - script: | JEST_JUNIT_OUTPUT_FILE=$(Build.SourcesDirectory)/js/react_native/e2e/ios-test-results.xml \ - detox test --record-logs all --configuration ios.sim.release + detox test --record-logs all --configuration ios.sim.release --loglevel trace workingDirectory: '$(Build.SourcesDirectory)/js/react_native/e2e' displayName: Run React Native Detox iOS e2e Tests From 9e19684944adfda4a414fc91a67259894fce2898 Mon Sep 17 00:00:00 2001 From: duanshengliu <44742794+duanshengliu@users.noreply.github.com> Date: Tue, 27 Feb 2024 12:56:32 +0800 Subject: [PATCH 067/279] Fix the TypeError issue in quantize.py (#19459) ### Description Fix related bug as described in https://github.com/microsoft/onnxruntime/issues/19430 --- onnxruntime/python/tools/quantization/quantize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index 1bd2ef42151d0..05d3ac248c92c 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -479,7 +479,7 @@ def inc_dataloader(): del dataloader model = sq.transform(extra_options.get("SmoothQuantAlpha", 0.5), extra_options.get("SmoothQuantFolding", True)) sq_path = tempfile.TemporaryDirectory(prefix="ort.quant.") - model_input = Path(sq_path).joinpath("sq_model.onnx").as_posix() + model_input = Path(sq_path.name).joinpath("sq_model.onnx").as_posix() model.save(model_input) nodes_to_exclude.extend([i.name for i in model.model.graph.node if i.name not in orig_nodes]) model = load_model_with_shape_infer(Path(model_input)) # use smooth quant model for calibration From 1e69b612382205b0588f08d2b808b12e32a50a51 Mon Sep 17 00:00:00 2001 From: cloudhan Date: Tue, 27 Feb 2024 16:06:06 +0800 Subject: [PATCH 068/279] Make version string detection more robust (#19615) `/opt/rocm/.info/version-dev` is only available if the `rocm-dev` metapackage is installed. This will bring a lot of unused packages which are not needed by the users, they may opt for fine grained control. Fallback to `rocm_version.h` in case `rocm-dev` is not installed. 
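For reference, a hedged sketch of what the two probes have to parse; the sample file contents below are hypothetical (including the `ROCM_BUILD_INFO` line), but the regular expressions mirror the ones added to CMakeLists.txt:

```python
import re

# Hypothetical contents of the two possible version sources.
version_dev_raw = "5.7.0-63\n"                                      # /opt/rocm/.info/version-dev (rocm-dev installed)
rocm_version_h_raw = '#define ROCM_BUILD_INFO "5.7.0.50700-63"\n'   # include/rocm_version.h fallback

match = re.match(r"^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$", version_dev_raw.strip())
if match is None:
    match = re.search(r'"([0-9]+)\.([0-9]+)\.([0-9]+).*"', rocm_version_h_raw)

major, minor, patch = (int(g) for g in match.groups())
print(f"ROCM_VERSION_DEV={major}.{minor}.{patch}")                    # 5.7.0
print(f"ROCM_VERSION_DEV_INT={major * 10000 + minor * 100 + patch}")  # 50700
```

Either source yields the same major/minor/patch triple, which is why falling back to the header keeps the rest of the build logic unchanged.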
--- cmake/CMakeLists.txt | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index ed9043f2adc4a..1376c90fbcefe 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -324,15 +324,27 @@ if (onnxruntime_USE_ROCM) endif() # replicate strategy used by pytorch to get ROCM_VERSION - # https://github.com/pytorch/pytorch/blob/8eb21488fdcdb8b0e6fa2e46179b5fa6c42e75af/cmake/public/LoadHIP.cmake#L153-L173 - file(READ "${onnxruntime_ROCM_HOME}/.info/version-dev" ROCM_VERSION_DEV_RAW) - string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$" ROCM_VERSION_DEV_MATCH ${ROCM_VERSION_DEV_RAW}) - if (ROCM_VERSION_DEV_MATCH) + # https://github.com/pytorch/pytorch/blob/5c5b71b6eebae76d744261715231093e62f0d090/cmake/public/LoadHIP.cmake + # with modification + if (EXISTS "${onnxruntime_ROCM_HOME}/.info/version-dev") + file(READ "${onnxruntime_ROCM_HOME}/.info/version-dev" ROCM_VERSION_DEV_RAW) + string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$" ROCM_VERSION_MATCH ${ROCM_VERSION_DEV_RAW}) + elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm_version.h") + file(READ "${onnxruntime_ROCM_HOME}/include/rocm_version.h" ROCM_VERSION_H_RAW) + string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW}) + elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h") + file(READ "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h" ROCM_VERSION_H_RAW) + string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW}) + endif() + + if (ROCM_VERSION_MATCH) set(ROCM_VERSION_DEV_MAJOR ${CMAKE_MATCH_1}) set(ROCM_VERSION_DEV_MINOR ${CMAKE_MATCH_2}) set(ROCM_VERSION_DEV_PATCH ${CMAKE_MATCH_3}) set(ROCM_VERSION_DEV "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}.${ROCM_VERSION_DEV_PATCH}") math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}") + else() + message(FATAL_ERROR "Cannot determine ROCm version string") endif() message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/.info/version-dev ****\n") message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}") From 4838cb6b3e98273fcdd6a3e54da74cd584167780 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Tue, 27 Feb 2024 02:27:35 -0800 Subject: [PATCH 069/279] [QNN Quantization] Ensure fused nodes have names (#19650) ### Description - Updates the `qnn_preprocess_model()` method to set a name for any new nodes added to the graph (due to fusion). - Updates the `qnn_preprocess_model()` method to set a name for any unnamed nodes that previously existed in the original graph. - Adds unit tests for fusions (previously missing) - Checks that fused node names exist and are unique - Checks that fused graph is equivalent to original graph ### Motivation and Context Nodes are not strictly required to have names. However, a planned/upcoming feature to support mixed-precision (integer) quantized models needs nodes to have names. 
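The naming scheme is easiest to see in isolation; the helper below mirrors `get_largest_node_name_suffix` from this change (the node names are made up for illustration):

```python
# Sketch of the suffix scan used to keep generated node names unique.
def largest_suffix(node_names, prefix):
    largest = -1
    for name in node_names:
        if name and name.startswith(prefix):
            try:
                largest = max(largest, int(name[len(prefix):]))
            except ValueError:
                continue  # prefix followed by something non-numeric; ignore it
    return largest

existing = ["qnn_preproc_node_0", "qnn_preproc_node_3", "", "Conv_12"]
next_index = largest_suffix(existing, "qnn_preproc_node_") + 1
print(f"qnn_preproc_node_{next_index}")  # -> qnn_preproc_node_4, given to the first unnamed node
```

Fused nodes get the same treatment with an operator-specific prefix built from `fused_op_type + "_fused_" + search_op_type + "_"` (for example `Gelu_fused_Erf_0`), so generated names do not collide with names already present in the graph.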
--- .../execution_providers/qnn/fusion_lpnorm.py | 7 +- .../execution_providers/qnn/preprocess.py | 11 + .../tools/quantization/fusions/fusion.py | 15 + .../tools/quantization/fusions/fusion_gelu.py | 25 +- .../quantization/fusions/fusion_layernorm.py | 1 + .../python/tools/quantization/onnx_model.py | 17 + .../test/python/quantization/test_fusions.py | 401 ++++++++++++++++++ 7 files changed, 465 insertions(+), 12 deletions(-) create mode 100644 onnxruntime/test/python/quantization/test_fusions.py diff --git a/onnxruntime/python/tools/quantization/execution_providers/qnn/fusion_lpnorm.py b/onnxruntime/python/tools/quantization/execution_providers/qnn/fusion_lpnorm.py index 9ebf400498e0e..fbf954febdda4 100644 --- a/onnxruntime/python/tools/quantization/execution_providers/qnn/fusion_lpnorm.py +++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/fusion_lpnorm.py @@ -122,6 +122,11 @@ def fuse( self.nodes_to_remove.extend(subgraph_nodes) fused_node = onnx.helper.make_node( - self.fused_op_type, inputs=[subgraph_input], outputs=[subgraph_output], p=2, axis=-1 + self.fused_op_type, + name=self.create_unique_node_name(), + inputs=[subgraph_input], + outputs=[subgraph_output], + p=2, + axis=-1, ) self.nodes_to_add.append(fused_node) diff --git a/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py b/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py index becbaceab184e..b1c114fe1f9fd 100644 --- a/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py +++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py @@ -44,6 +44,17 @@ def qnn_preprocess_model(model_input: Path, model_output: Path, fuse_layernorm: if fusion_layernorm.apply(): modified = True + # Make sure all nodes have a name. + unnamed_node_prefix = "qnn_preproc_node_" + available_suffix = onnx_model.get_largest_node_name_suffix(unnamed_node_prefix) + 1 + for node in onnx_model.model.graph.node: + if node.op_type != "Constant" and not node.name: + new_node_name = f"{unnamed_node_prefix}{available_suffix!s}" + available_suffix += 1 + node.name = new_node_name + modified = True + logging.warning(f"Node of type {node.op_type} does not have a name. Renamed to {new_node_name}.") + if modified: onnx_model.topological_sort() onnx.save_model(model, model_output) diff --git a/onnxruntime/python/tools/quantization/fusions/fusion.py b/onnxruntime/python/tools/quantization/fusions/fusion.py index b54b421226f1a..4bdc5c26cc946 100644 --- a/onnxruntime/python/tools/quantization/fusions/fusion.py +++ b/onnxruntime/python/tools/quantization/fusions/fusion.py @@ -24,6 +24,9 @@ def __init__(self, model: ONNXModel, fused_op_type: str, search_op_type: str): self.nodes_to_remove: list = [] self.nodes_to_add: list = [] + self._new_node_name_prefix = self.fused_op_type + "_fused_" + self.search_op_type + "_" + self._new_node_name_suffix = None # int|None used to create unique node names for the fused ops. 
+ def fuse( self, node: onnx.NodeProto, @@ -57,6 +60,18 @@ def apply(self) -> bool: return graph_updated + def create_unique_node_name(self): + prefix = self._new_node_name_prefix + + if self._new_node_name_suffix is None: + largest_suffix: int = self.model.get_largest_node_name_suffix(prefix) + self._new_node_name_suffix = largest_suffix + 1 + + new_name = f"{prefix}{self._new_node_name_suffix!s}" + self._new_node_name_suffix += 1 + + return new_name + @staticmethod def is_safe_to_fuse_nodes( nodes_to_remove: list[onnx.NodeProto], diff --git a/onnxruntime/python/tools/quantization/fusions/fusion_gelu.py b/onnxruntime/python/tools/quantization/fusions/fusion_gelu.py index a20d6dbffd7a7..42c4a11833641 100644 --- a/onnxruntime/python/tools/quantization/fusions/fusion_gelu.py +++ b/onnxruntime/python/tools/quantization/fusions/fusion_gelu.py @@ -112,7 +112,9 @@ def fuse_1( return False self.nodes_to_remove.extend(subgraph_nodes) - fused_node = onnx.helper.make_node("Gelu", inputs=[subgraph_input], outputs=[subgraph_output]) + fused_node = onnx.helper.make_node( + "Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[subgraph_output] + ) fused_node.domain = "com.microsoft" self.nodes_to_add.append(fused_node) return True @@ -173,11 +175,9 @@ def fuse_2( if not self.has_constant_input(sqrt_node, 2.0): return False - root_node = self.model.get_parent(div, 0, output_name_to_node) - if root_node is None: - return False + subgraph_input = div.input[0] - if root_node.output[0] not in mul.input: + if subgraph_input not in mul.input: return False subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul] @@ -188,7 +188,9 @@ def fuse_2( return False self.nodes_to_remove.extend(subgraph_nodes) - fused_node = onnx.helper.make_node("Gelu", inputs=[root_node.output[0]], outputs=[mul.output[0]]) + fused_node = onnx.helper.make_node( + "Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[mul.output[0]] + ) fused_node.domain = "com.microsoft" self.nodes_to_add.append(fused_node) return True @@ -239,9 +241,8 @@ def fuse_3( if i < 0: return False - root_node = self.model.get_parent(first_mul, 0 if i == 1 else 1, output_name_to_node) - if root_node is None: - return False + root_input_index = 1 - i + subgraph_input = first_mul.input[root_input_index] if mul_half.output[0] not in input_name_to_nodes: return False @@ -250,7 +251,7 @@ def fuse_3( return False last_mul = children[0] - if not (last_mul.input[0] == root_node.output[0] or last_mul.input[1] == root_node.output[0]): + if not (last_mul.input[0] == subgraph_input or last_mul.input[1] == subgraph_input): return False subgraph_nodes = [first_mul, erf_node, add_after_erf, mul_half, last_mul] @@ -263,7 +264,9 @@ def fuse_3( return False self.nodes_to_remove.extend(subgraph_nodes) - fused_node = onnx.helper.make_node("Gelu", inputs=[root_node.output[0]], outputs=[last_mul.output[0]]) + fused_node = onnx.helper.make_node( + "Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[last_mul.output[0]] + ) fused_node.domain = "com.microsoft" self.nodes_to_add.append(fused_node) return True diff --git a/onnxruntime/python/tools/quantization/fusions/fusion_layernorm.py b/onnxruntime/python/tools/quantization/fusions/fusion_layernorm.py index d7fb89236d3d2..7d58c1c180822 100644 --- a/onnxruntime/python/tools/quantization/fusions/fusion_layernorm.py +++ b/onnxruntime/python/tools/quantization/fusions/fusion_layernorm.py @@ -127,6 +127,7 @@ def fuse( normalize_node = onnx.helper.make_node( 
"LayerNormalization", + name=self.create_unique_node_name(), inputs=[reduce_mean_node.input[0], weight_input, bias_input], outputs=[last_add_node.output[0]], ) diff --git a/onnxruntime/python/tools/quantization/onnx_model.py b/onnxruntime/python/tools/quantization/onnx_model.py index 4591c9c950e6e..46d245d353a07 100644 --- a/onnxruntime/python/tools/quantization/onnx_model.py +++ b/onnxruntime/python/tools/quantization/onnx_model.py @@ -283,6 +283,23 @@ def find_node_by_name(self, node_name, new_nodes_list, graph): node = find_by_name(node_name, graph_nodes_list) return node + def get_largest_node_name_suffix(self, node_name_prefix): + """ + Gets the largest node name (int) suffix for all node names that begin with `node_name_prefix`. + Example: for nodes my_prefix_0 and my_prefix_3, this method returns 3. + """ + suffix = -1 + + for node in self.model.graph.node: + if node.name and node.name.startswith(node_name_prefix): + try: + index = int(node.name[len(node_name_prefix) :]) + suffix = max(index, suffix) + except ValueError: + continue + + return suffix + def find_nodes_by_initializer(self, graph, initializer): """ Find all nodes with given initializer as an input. diff --git a/onnxruntime/test/python/quantization/test_fusions.py b/onnxruntime/test/python/quantization/test_fusions.py new file mode 100644 index 0000000000000..bea110e566fb9 --- /dev/null +++ b/onnxruntime/test/python/quantization/test_fusions.py @@ -0,0 +1,401 @@ +#!/usr/bin/env python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- + +import math +import unittest + +import numpy as np +import onnx + +import onnxruntime +from onnxruntime.quantization.execution_providers.qnn.fusion_lpnorm import FusionLpNormalization +from onnxruntime.quantization.fusions import FusionGelu, FusionLayerNormalization +from onnxruntime.quantization.onnx_model import ONNXModel + + +class TestFusions(unittest.TestCase): + def check_fused_model_correctness(self, orig_model, fused_model, inputs, rtol=1e-7, atol=0): + """ + Checks that the output of the fused model matches the output of the original model. + """ + orig_session = onnxruntime.InferenceSession(orig_model.SerializeToString(), providers=["CPUExecutionProvider"]) + orig_results = orig_session.run(None, inputs) + + fused_session = onnxruntime.InferenceSession( + fused_model.SerializeToString(), providers=["CPUExecutionProvider"] + ) + fused_results = fused_session.run([], inputs) + + self.assertEqual(len(orig_results), len(fused_results), "Number of outputs for fused model differs") + for idx, expected_output in enumerate(orig_results): + actual_output = fused_results[idx] + np.testing.assert_allclose( + expected_output, + actual_output, + rtol=rtol, + atol=atol, + err_msg=f"Fused model output {idx} differs", + ) + + def build_erf_sequence_1_model(self, shape): + """ + Erf sequence that fuses into Gelu: + +-------Mul(0.5)---------------------+ + | | + | v + [root] --> Div -----> Erf --> Add --> Mul --> + (B=1.4142...) 
(1) + + This method builds 2 of these Erf sequences: + + [root] -> ERF_SEQUENCE1 -> ERF_SEQUENCE2 -> output + """ + root_inp = onnx.helper.make_tensor_value_info("root", onnx.TensorProto.FLOAT, shape) + output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, shape) + one_const = onnx.numpy_helper.from_array(np.array(1.0, dtype=np.float32), "one_const") + half_const = onnx.numpy_helper.from_array(np.array(0.5, dtype=np.float32), "half_const") + root2_const = onnx.numpy_helper.from_array(np.array(math.sqrt(2.0), dtype=np.float32), "root2_const") + + # First Erf sequence + mul0_node = onnx.helper.make_node("Mul", ["root", "half_const"], ["mul0_out"]) + div_node = onnx.helper.make_node("Div", ["root", "root2_const"], ["div_out"]) + erf_node = onnx.helper.make_node("Erf", ["div_out"], ["erf_out"]) + add_node = onnx.helper.make_node("Add", ["erf_out", "one_const"], ["add_out"]) + mul1_node = onnx.helper.make_node("Mul", ["add_out", "mul0_out"], ["seq1_output"]) + + # Second Erf sequence + mul0_node_dup = onnx.helper.make_node("Mul", ["seq1_output", "half_const"], ["mul0_out_dup"]) + div_node_dup = onnx.helper.make_node("Div", ["seq1_output", "root2_const"], ["div_out_dup"]) + erf_node_dup = onnx.helper.make_node("Erf", ["div_out_dup"], ["erf_out_dup"]) + add_node_dup = onnx.helper.make_node("Add", ["erf_out_dup", "one_const"], ["add_out_dup"]) + mul1_node_dup = onnx.helper.make_node("Mul", ["add_out_dup", "mul0_out_dup"], ["output"]) + + graph = onnx.helper.make_graph( + [ + mul0_node, + div_node, + erf_node, + add_node, + mul1_node, + mul0_node_dup, + div_node_dup, + erf_node_dup, + add_node_dup, + mul1_node_dup, + ], + "two_erf_sequences", + [root_inp], + [output], + initializer=[one_const, half_const, root2_const], + ) + opset_imports = [ + onnx.helper.make_opsetid("", 18), + onnx.helper.make_opsetid("com.microsoft", 1), + ] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + return ONNXModel(model) + + def build_erf_sequence_2_model(self, shape): + """ + +------------------------------------+ + | | + | v + [root] --> Div -----> Erf --> Add --> Mul -->Mul --> + (B=1.4142...) 
(1) (0.5) + + """ + root_inp = onnx.helper.make_tensor_value_info("root", onnx.TensorProto.FLOAT, shape) + output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, shape) + one_const = onnx.numpy_helper.from_array(np.array(1.0, dtype=np.float32), "one_const") + half_const = onnx.numpy_helper.from_array(np.array(0.5, dtype=np.float32), "half_const") + root2_const = onnx.numpy_helper.from_array(np.array(math.sqrt(2.0), dtype=np.float32), "root2_const") + + div_node = onnx.helper.make_node("Div", ["root", "root2_const"], ["div_out"]) + erf_node = onnx.helper.make_node("Erf", ["div_out"], ["erf_out"]) + add_node = onnx.helper.make_node("Add", ["erf_out", "one_const"], ["add_out"]) + mul0_node = onnx.helper.make_node("Mul", ["add_out", "root"], ["mul0_out"]) + mul1_node = onnx.helper.make_node("Mul", ["mul0_out", "half_const"], ["output"]) + + graph = onnx.helper.make_graph( + [div_node, erf_node, add_node, mul0_node, mul1_node], + "erf_sequence_2", + [root_inp], + [output], + initializer=[one_const, half_const, root2_const], + ) + opset_imports = [ + onnx.helper.make_opsetid("", 18), + onnx.helper.make_opsetid("com.microsoft", 1), + ] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + return ONNXModel(model) + + def build_erf_sequence_3_model(self, shape): + """ + +------------------------------------------+ + | | + | v + [root] --> Div -----> Erf --> Add --> Mul -->Mul + (B=1.4142...) (A=1) (A=0.5) + + """ + root_inp = onnx.helper.make_tensor_value_info("root", onnx.TensorProto.FLOAT, shape) + output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, shape) + one_const = onnx.numpy_helper.from_array(np.array(1.0, dtype=np.float32), "one_const") + half_const = onnx.numpy_helper.from_array(np.array(0.5, dtype=np.float32), "half_const") + root2_const = onnx.numpy_helper.from_array(np.array(math.sqrt(2.0), dtype=np.float32), "root2_const") + + div_node = onnx.helper.make_node("Div", ["root", "root2_const"], ["div_out"]) + erf_node = onnx.helper.make_node("Erf", ["div_out"], ["erf_out"]) + add_node = onnx.helper.make_node("Add", ["erf_out", "one_const"], ["add_out"]) + mul0_node = onnx.helper.make_node("Mul", ["add_out", "half_const"], ["mul0_out"]) + mul1_node = onnx.helper.make_node("Mul", ["mul0_out", "root"], ["output"]) + + graph = onnx.helper.make_graph( + [div_node, erf_node, add_node, mul0_node, mul1_node], + "erf_sequence_3", + [root_inp], + [output], + initializer=[one_const, half_const, root2_const], + ) + opset_imports = [ + onnx.helper.make_opsetid("", 18), + onnx.helper.make_opsetid("com.microsoft", 1), + ] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + return ONNXModel(model) + + def build_erf_sequence_4_model(self, shape): + """ + +----------------------------------------------+ + | | + | v + [root] --> Mul -----> Erf --> Add --> Mul -->Mul + (A=0.7071067690849304) (B=1) (B=0.5) + + """ + root_inp = onnx.helper.make_tensor_value_info("root", onnx.TensorProto.FLOAT, shape) + output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, shape) + one_const = onnx.numpy_helper.from_array(np.array(1.0, dtype=np.float32), "one_const") + half_const = onnx.numpy_helper.from_array(np.array(0.5, dtype=np.float32), "half_const") + frac_const = onnx.numpy_helper.from_array(np.array(0.7071067690849304, dtype=np.float32), "frac_const") + + mul0_node = onnx.helper.make_node("Mul", ["root", "frac_const"], ["mul0_out"]) + erf_node = onnx.helper.make_node("Erf", ["mul0_out"], ["erf_out"]) + add_node 
= onnx.helper.make_node("Add", ["erf_out", "one_const"], ["add_out"]) + mul1_node = onnx.helper.make_node("Mul", ["add_out", "half_const"], ["mul1_out"]) + mul2_node = onnx.helper.make_node("Mul", ["mul1_out", "root"], ["output"]) + + graph = onnx.helper.make_graph( + [mul0_node, erf_node, add_node, mul1_node, mul2_node], + "erf_sequence_4", + [root_inp], + [output], + initializer=[one_const, half_const, frac_const], + ) + opset_imports = [ + onnx.helper.make_opsetid("", 18), + onnx.helper.make_opsetid("com.microsoft", 1), + ] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + return ONNXModel(model) + + def build_reduce_mean_sequence_model(self, shape, scale_val, bias_val, axis=-1): + """ + +----------------------+ + | | + | v + [Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add + (axis=2 or -1) | (Y=2) (axis=2 or -1) (E-6 or E-12 or 0) ^ ^ ^ + | | | | + +-------------------------------------------------+ [Scale] [Bias] + """ + root_inp = onnx.helper.make_tensor_value_info("root", onnx.TensorProto.FLOAT, shape) + output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, shape) + scale_const = onnx.numpy_helper.from_array(np.array(scale_val, dtype=np.float32), "scale_const") + bias_const = onnx.numpy_helper.from_array(np.array(bias_val, dtype=np.float32), "bias_const") + axes_const = onnx.numpy_helper.from_array(np.array([axis], dtype=np.int64), "axes_const") + two_const = onnx.numpy_helper.from_array(np.array(2.0, dtype=np.float32), "two_const") + eps_const = onnx.numpy_helper.from_array(np.array(1.0e-8, dtype=np.float32), "eps_const") + + rm0_node = onnx.helper.make_node("ReduceMean", ["root", "axes_const"], ["rm0_out"]) + sub_node = onnx.helper.make_node("Sub", ["root", "rm0_out"], ["sub_out"]) + pow_node = onnx.helper.make_node("Pow", ["sub_out", "two_const"], ["pow_out"]) + rm1_node = onnx.helper.make_node("ReduceMean", ["pow_out", "axes_const"], ["rm1_out"]) + add0_node = onnx.helper.make_node("Add", ["rm1_out", "eps_const"], ["add0_out"]) + sqrt_node = onnx.helper.make_node("Sqrt", ["add0_out"], ["sqrt_out"]) + div_node = onnx.helper.make_node("Div", ["sub_out", "sqrt_out"], ["div_out"]) + mul_node = onnx.helper.make_node("Mul", ["div_out", "scale_const"], ["mul_out"]) + add1_node = onnx.helper.make_node("Add", ["mul_out", "bias_const"], ["output"]) + + graph = onnx.helper.make_graph( + [rm0_node, sub_node, pow_node, rm1_node, add0_node, sqrt_node, div_node, mul_node, add1_node], + "reduce_mean_sequence", + [root_inp], + [output], + initializer=[scale_const, bias_const, axes_const, two_const, eps_const], + ) + opset_imports = [ + onnx.helper.make_opsetid("", 18), + ] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + return ONNXModel(model) + + def build_reduce_l2_sequence_model(self, shape, epsilon_val, axis=-1): + """ + [root] --> ReduceL2 -----> Clip --> Expand ----> Div --> + | (axis=-1) (min=epsilon) (shape=root) ^ + | (keepdims=True) | + | | + +-----------------------------------------------+ + """ + root_inp = onnx.helper.make_tensor_value_info("root", onnx.TensorProto.FLOAT, shape) + output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, shape) + axes_const = onnx.numpy_helper.from_array(np.array([axis], dtype=np.int64), "axes_const") + eps_const = onnx.numpy_helper.from_array(np.array(epsilon_val, dtype=np.float32), "eps_const") + shape_const = onnx.numpy_helper.from_array(np.array(list(shape), dtype=np.int64), "shape_const") + + rl2_node = 
onnx.helper.make_node("ReduceL2", ["root", "axes_const"], ["rl2_out"], keepdims=1) + clip_node = onnx.helper.make_node("Clip", ["rl2_out", "eps_const"], ["clip_out"]) + expand_node = onnx.helper.make_node("Expand", ["clip_out", "shape_const"], ["expand_out"]) + div_node = onnx.helper.make_node("Div", ["root", "expand_out"], ["output"]) + + graph = onnx.helper.make_graph( + [rl2_node, clip_node, expand_node, div_node], + "reducel2_sequence", + [root_inp], + [output], + initializer=[axes_const, eps_const, shape_const], + ) + opset_imports = [ + onnx.helper.make_opsetid("", 18), + ] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + return ONNXModel(model) + + def test_fuse_erf_to_gelu_1(self): + shape = (1, 2, 3) + model = self.build_erf_sequence_1_model(shape) + orig_model = onnx.ModelProto() + orig_model.CopyFrom(model.model) + + # Check that fusion simplified model to 2 Gelu nodes. + modified = FusionGelu(model).apply() + self.assertTrue(modified) + self.assertEqual(len(model.model.graph.node), 2) + + gelu_node_0 = model.model.graph.node[0] + gelu_node_1 = model.model.graph.node[1] + self.assertEqual(gelu_node_0.op_type, "Gelu") + self.assertEqual(gelu_node_1.op_type, "Gelu") + + self.assertTrue(gelu_node_0.name) + self.assertTrue(gelu_node_1.name) + self.assertNotEqual(gelu_node_0.name, gelu_node_1.name) # Generated names should not be equal + + # Check that fusion is equivalent to original Erf model. + inputs = {"root": np.ones(shape, dtype=np.float32)} + self.check_fused_model_correctness(orig_model, model.model, inputs) + + def test_fuse_erf_to_gelu_2(self): + shape = (1, 2, 3) + model = self.build_erf_sequence_2_model(shape) + orig_model = onnx.ModelProto() + orig_model.CopyFrom(model.model) + + # Check that fusion simplified model to 1 Gelu node. + modified = FusionGelu(model).apply() + self.assertTrue(modified) + self.assertEqual(len(model.model.graph.node), 1) + + gelu_node = model.model.graph.node[0] + self.assertEqual(gelu_node.op_type, "Gelu") + self.assertTrue(gelu_node.name) + + # Check that fusion is equivalent to original Erf model. + inputs = {"root": np.ones(shape, dtype=np.float32)} + self.check_fused_model_correctness(orig_model, model.model, inputs) + + def test_fuse_erf_to_gelu_3(self): + shape = (1, 2, 3) + model = self.build_erf_sequence_3_model(shape) + orig_model = onnx.ModelProto() + orig_model.CopyFrom(model.model) + + # Check that fusion simplified model to 1 Gelu node. + modified = FusionGelu(model).apply() + self.assertTrue(modified) + self.assertEqual(len(model.model.graph.node), 1) + + gelu_node = model.model.graph.node[0] + self.assertEqual(gelu_node.op_type, "Gelu") + self.assertTrue(gelu_node.name) + + # Check that fusion is equivalent to original Erf model. + inputs = {"root": np.ones(shape, dtype=np.float32)} + self.check_fused_model_correctness(orig_model, model.model, inputs) + + def test_fuse_erf_to_gelu_4(self): + shape = (1, 2, 3) + model = self.build_erf_sequence_4_model(shape) + orig_model = onnx.ModelProto() + orig_model.CopyFrom(model.model) + + # Check that fusion simplified model to 1 Gelu node. + modified = FusionGelu(model).apply() + self.assertTrue(modified) + self.assertEqual(len(model.model.graph.node), 1) + + gelu_node = model.model.graph.node[0] + self.assertEqual(gelu_node.op_type, "Gelu") + self.assertTrue(gelu_node.name) + + # Check that fusion is equivalent to original Erf model. 
+ inputs = {"root": np.ones(shape, dtype=np.float32)} + self.check_fused_model_correctness(orig_model, model.model, inputs) + + def test_fuse_reduce_l2_to_lpnorm(self): + shape = (1, 2, 3) + model = self.build_reduce_l2_sequence_model(shape, 1e-12, axis=-1) + orig_model = onnx.ModelProto() + orig_model.CopyFrom(model.model) + + # Check that fusion simplified model to 1 LpNormalization node. + modified = FusionLpNormalization(model).apply() + self.assertTrue(modified) + self.assertEqual(len(model.model.graph.node), 1) + + lpnorm_node = model.model.graph.node[0] + self.assertEqual(lpnorm_node.op_type, "LpNormalization") + self.assertTrue(lpnorm_node.name) + + # LpNorm's p attribute should be set to 2 + p_attr = next(attr for attr in lpnorm_node.attribute if attr.name == "p") + self.assertEqual(p_attr.i, 2) + + def test_fuse_reduce_mean_to_layer_norm(self): + shape = (1, 2, 3) + model = self.build_reduce_mean_sequence_model(shape, [2.0, 2.0, 2.0], [1.0, 1.0, 1.0], axis=-1) + orig_model = onnx.ModelProto() + orig_model.CopyFrom(model.model) + + # Check that fusion simplified model to 1 LayerNormalization node. + modified = FusionLayerNormalization(model).apply() + self.assertTrue(modified) + self.assertEqual(len(model.model.graph.node), 1) + + layer_norm_node = model.model.graph.node[0] + self.assertEqual(layer_norm_node.op_type, "LayerNormalization") + self.assertTrue(layer_norm_node.name) + + # Check that fused model is equivalent to original model. + inputs = {"root": np.ones(shape, dtype=np.float32)} + self.check_fused_model_correctness(orig_model, model.model, inputs) + + +if __name__ == "__main__": + unittest.main() From 3b46ab643944a3bcc9e4d9eb2c155ead0bad5cdb Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Wed, 28 Feb 2024 00:46:29 +0800 Subject: [PATCH 070/279] Re-add testing removed by mistake. 
(#19647) --- .../azure-pipelines/linux-ci-pipeline.yml | 42 ++++++++++++++++++- .../docker/scripts/manylinux/requirements.txt | 1 + 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml index a4bd24b4dd18b..02147c321fab3 100644 --- a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml @@ -115,6 +115,7 @@ stages: searchFolder: '$(Build.BinariesDirectory)' testRunTitle: 'Unit Test Run' condition: succeededOrFailed() + - job: Linux_Release timeoutInMinutes: 180 workspace: @@ -243,7 +244,46 @@ stages: ln -s /data/models $(Build.BinariesDirectory)/models displayName: link model dir - + - bash: | + mkdir -p $HOME/.onnx + docker run --rm \ + --volume /data/onnx:/data/onnx:ro \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + onnxruntimecpubuild \ + /bin/bash -c " + set -ex; \ + pushd /onnxruntime_src/csharp; \ + dotnet restore /onnxruntime_src/csharp/OnnxRuntime.DesktopOnly.CSharp.sln; \ + dotnet build /onnxruntime_src/csharp/OnnxRuntime.DesktopOnly.CSharp.sln -c Release; \ + dotnet test /onnxruntime_src/csharp/OnnxRuntime.DesktopOnly.CSharp.sln -c Release -f net6.0 --no-build -l \"console;verbosity=normal\"; \ + popd + " + displayName: 'Dotnet build C# sln and Test' + + - bash: | + mkdir -p $HOME/.onnx + docker run --rm \ + --volume /data/onnx:/data/onnx:ro \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + onnxruntimecpubuild \ + /bin/bash -c " + set -ex; \ + /bin/bash /onnxruntime_src/tools/scripts/python_test.sh /onnxruntime_src /build Release && \ + /bin/bash /onnxruntime_src/tools/scripts/symbolic_shape_infer_test.sh /build + " + displayName: 'Run Release tests and symbolic shape infer test' - task: PublishTestResults@2 displayName: 'Publish unit test results' diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index 94f52f476579b..886f19388d01e 100644 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -10,3 +10,4 @@ protobuf==4.21.12 sympy==1.12 flatbuffers neural-compressor>=2.2.1 +triton From 580ee20dfce2849029229eb213dc8c7c87a89483 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Wed, 28 Feb 2024 02:56:16 +1000 Subject: [PATCH 071/279] Tweak Windows build parallelization settings (#19664) ### Description Use UseMultiToolTask and limit the number of cl.exe instances running. MultiToolTask info: https://devblogs.microsoft.com/cppblog/improved-parallelism-in-msbuild/ Info on why limiting CL_MPCount can help: https://github.com/Microsoft/checkedc-clang/wiki/Parallel-builds-of-clang-on-Windows The current CIs have 4 cores (both physical and logical). Hardcoded the GPU build in win-ci.yml to use CL_MPCount of 2 as that seems to work fine. 
Can adjust if needed to base it on the actual number of cores or to use build.py to build. Caveat: I've run about 16 builds and haven't seen a slow build yet, but as the root cause of the slow builds isn't really known this isn't guaranteed to be a fix. ### Motivation and Context Try and prevent super slow GPU builds by reducing number of tasks potentially running in parallel. --- tools/ci_build/build.py | 15 ++++++++++++++- .../github/azure-pipelines/templates/win-ci.yml | 3 ++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 74c473d34f548..1056c4ed84510 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1451,6 +1451,13 @@ def generate_build_tree( # tools need to use the symbols. add_default_definition(cmake_extra_defines, "CMAKE_MSVC_DEBUG_INFORMATION_FORMAT", "ProgramDatabase") + if number_of_parallel_jobs(args) > 0: + # https://devblogs.microsoft.com/cppblog/improved-parallelism-in-msbuild/ + # NOTE: this disables /MP if set (according to comments on blog post). + # By default, MultiProcMaxCount and CL_MPCount value are equal to the number of CPU logical processors. + # See logic around setting CL_MPCount below + cmake_args += ["-DCMAKE_VS_GLOBALS=UseMultiToolTask=true;EnforceProcessCountAcrossBuilds=true"] + cmake_args += [f"-D{define}" for define in cmake_extra_defines] cmake_args += cmake_extra_args @@ -1662,11 +1669,17 @@ def build_targets(args, cmake_path, build_dir, configs, num_parallel_jobs, targe build_tool_args = [] if num_parallel_jobs != 1: if is_windows() and args.cmake_generator != "Ninja" and not args.build_wasm: + # https://github.com/Microsoft/checkedc-clang/wiki/Parallel-builds-of-clang-on-Windows suggests + # not maxing out CL_MPCount + # Start by having one less than num_parallel_jobs (default is num logical cores), + # limited to a range of 1..3 + # that gives maxcpucount projects building using up to 3 cl.exe instances each build_tool_args += [ f"/maxcpucount:{num_parallel_jobs}", + # one less than num_parallel_jobs, at least 1, up to 3 + f"/p:CL_MPCount={min(max(num_parallel_jobs - 1, 1), 3)}", # if nodeReuse is true, msbuild processes will stay around for a bit after the build completes "/nodeReuse:False", - f"/p:CL_MPCount={num_parallel_jobs}", ] elif args.cmake_generator == "Xcode": build_tool_args += [ diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 8ed22153fd947..e32956d6eb913 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -162,10 +162,11 @@ stages: platform: ${{ parameters.msbuildPlatform }} configuration: RelWithDebInfo msbuildArchitecture: ${{ parameters.buildArch }} - maximumCpuCount: true + maximumCpuCount: true # default is num logical cores worth of projects building concurrently logProjectEvents: true workingFolder: '$(Build.BinariesDirectory)\RelWithDebInfo' createLogFile: true + msbuildArgs: "/p:CL_MPCount=2" # 2x cl.exe per project building. 
- task: PythonScript@0 displayName: 'test' From 1c468a03b90aa8122d49b3148152a67b0519d36e Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Wed, 28 Feb 2024 03:27:43 +1000 Subject: [PATCH 072/279] Improve Nuget-CUDA-Packaging-Pipeline (#19668) ### Description * Publish the artifacts as late as possible * once published the artifacts are immutable, and any retry will fail if they exist * if any step fails after publishing the stage cannot be retried * use powershell to cleanup * DeleteFiles is taking >30 mins and causing the stage to timeout * powershell took < 1s ### Motivation and Context Make pipeline more robust --- .../stages/nuget-combine-cuda-stage.yml | 13 ++++++------- ...mponent-governance-component-detection-steps.yml | 7 ++----- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml index 8ca3d9148b514..064e2ea91d194 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml @@ -213,13 +213,6 @@ stages: PlatformsSupported: 'linux-x64' VerifyNugetSigning: false - - task: PublishPipelineArtifact@0 - displayName: 'Publish Pipeline NuGet Artifact' - inputs: - artifactName: 'drop-signed-nuget-GPU' - targetPath: '$(Build.ArtifactStagingDirectory)' - - - task: MSBuild@1 displayName: 'Clean C#' inputs: @@ -241,6 +234,12 @@ stages: parameters: condition: 'succeeded' + - task: PublishPipelineArtifact@0 + displayName: 'Publish Pipeline NuGet Artifact' + inputs: + artifactName: 'drop-signed-nuget-GPU' + targetPath: '$(Build.ArtifactStagingDirectory)' + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 displayName: 'Clean Agent Directories' condition: always() diff --git a/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml b/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml index f1418e75bffa2..3d128fdb78eee 100644 --- a/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml @@ -6,11 +6,8 @@ parameters: steps: - ${{ if eq(variables['System.TeamProject'], 'Lotus') }}: - - task: DeleteFiles@1 - inputs: - SourceFolder: '$(Build.BinariesDirectory)' - contents: | - **/* + - powershell: | + Remove-Item $(Build.BinariesDirectory)/* -Recurse -Force displayName: 'Clean up build directory' - task: ms.vss-governance-buildtask.governance-build-task-component-detection.ComponentGovernanceComponentDetection@0 From 2e4d1b8f1ba928fe5879077eced9cd5191760cfb Mon Sep 17 00:00:00 2001 From: zesongw Date: Wed, 28 Feb 2024 02:01:12 +0800 Subject: [PATCH 073/279] [WebNN EP] Add support for Op MatMul of WebNN CPU backend (#19413) Enable MatMul support for WebNN CPU backend to support more models. 
--- onnxruntime/core/providers/webnn/builders/helper.h | 2 +- .../webnn/builders/impl/gemm_op_builder.cc | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index d94729e60d029..d7892fe02c1ba 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -195,7 +195,7 @@ static const InlinedHashMap op_map = { {"LessOrEqual", {"lesserOrEqual", false}}, {"Log", {"log", false}}, {"LpPool", {"l2Pool2d", false}}, - {"MatMul", {"matmul", false}}, + {"MatMul", {"matmul", true}}, {"MatMulInteger", {"matmulInteger", false}}, {"Max", {"max", true}}, {"MaxPool", {"maxPool2d", true}}, diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc index 4bf991a1b0105..d5f84f853f7de 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc @@ -29,7 +29,7 @@ class GemmOpBuilder : public BaseOpBuilder { // Add operator related. Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, - const logging::Logger& /* logger */) const { + const logging::Logger& logger) const { const auto& op_type = node.OpType(); const auto& input_defs = node.InputDefs(); const size_t a_idx = 0, b_idx = 1, c_idx = 2; // A*B+C @@ -38,7 +38,17 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N emscripten::val b = model_builder.GetOperand(node.InputDefs()[b_idx]->Name()); emscripten::val output = emscripten::val::object(); if (op_type == "MatMul") { - output = model_builder.GetBuilder().call("matmul", a, b); + std::vector a_shape; + if (!GetShape(*input_defs[a_idx], a_shape, logger)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Can not get shape of A."); + } + // The inputs of MatMul must be at least 3D for WebNN CPU backend. Use GEMM for 2D case. + // TODO: Remove this workaround when it is fixed in Chromium. + if (model_builder.GetWebnnDeviceType() == WebnnDeviceType::CPU && a_shape.size() == 2) { + output = model_builder.GetBuilder().call("gemm", a, b); + } else { + output = model_builder.GetBuilder().call("matmul", a, b); + } } else if (op_type == "MatMulInteger") { emscripten::val a_zero_point = emscripten::val::null(); emscripten::val b_zero_point = emscripten::val::null(); From 3cb81cdde25d059af5674506f6a5b899c9c0f5ee Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 27 Feb 2024 11:07:15 -0800 Subject: [PATCH 074/279] [js/common] move 'env.wasm.trace' to 'env.trace' (#19617) ### Description Try to move 'env.wasm.trace' to 'env.trace' to make it less confusing, because it also works in webgpu. Marked 'env.wasm.trace' as deprecated. --- js/common/lib/env.ts | 9 +++++++++ js/common/lib/trace.ts | 6 +++--- js/web/lib/wasm/jsep/backend-webgpu.ts | 3 ++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/js/common/lib/env.ts b/js/common/lib/env.ts index 6299c26159400..73a47d1a4f937 100644 --- a/js/common/lib/env.ts +++ b/js/common/lib/env.ts @@ -36,6 +36,7 @@ export declare namespace Env { /** * set or get a boolean value indicating whether to enable trace. * + * @deprecated Use `env.trace` instead. If `env.trace` is set, this property will be ignored. 
* @defaultValue `false` */ trace?: boolean; @@ -167,6 +168,7 @@ export interface Env { * @defaultValue `'warning'` */ logLevel?: 'verbose'|'info'|'warning'|'error'|'fatal'; + /** * Indicate whether run in debug mode. * @@ -174,6 +176,13 @@ export interface Env { */ debug?: boolean; + /** + * set or get a boolean value indicating whether to enable trace. + * + * @defaultValue `false` + */ + trace?: boolean; + /** * Get version of the current package. */ diff --git a/js/common/lib/trace.ts b/js/common/lib/trace.ts index 404f7ef8089af..7e0487b350198 100644 --- a/js/common/lib/trace.ts +++ b/js/common/lib/trace.ts @@ -4,7 +4,7 @@ import {env} from './env-impl.js'; export const TRACE = (deviceType: string, label: string) => { - if (!env.wasm.trace) { + if (typeof env.trace === 'undefined' ? !env.wasm.trace : !env.trace) { return; } // eslint-disable-next-line no-console @@ -30,14 +30,14 @@ const TRACE_FUNC = (msg: string, extraMsg?: string) => { }; export const TRACE_FUNC_BEGIN = (extraMsg?: string) => { - if (!env.wasm.trace) { + if (typeof env.trace === 'undefined' ? !env.wasm.trace : !env.trace) { return; } TRACE_FUNC('BEGIN', extraMsg); }; export const TRACE_FUNC_END = (extraMsg?: string) => { - if (!env.wasm.trace) { + if (typeof env.trace === 'undefined' ? !env.wasm.trace : !env.trace) { return; } TRACE_FUNC('END', extraMsg); diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index 3e3a191ec3ead..27c5566ab9fed 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -710,7 +710,8 @@ export class WebGpuBackend { } setQueryType(): void { this.queryType = 'none'; - if (this.env.webgpu.profiling?.mode === 'default' || this.env.wasm.trace) { + if (this.env.webgpu.profiling?.mode === 'default' || + (typeof this.env.trace === 'undefined' ? 
this.env.wasm.trace : this.env.trace)) { if (this.device.features.has('chromium-experimental-timestamp-query-inside-passes')) { this.queryType = 'inside-passes'; } else if (this.device.features.has('timestamp-query')) { From c20ced4132d111e3e63844e292f0d8e318cffab2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20M=C3=BCller?= <44298237+gedoensmax@users.noreply.github.com> Date: Tue, 27 Feb 2024 20:26:48 +0100 Subject: [PATCH 075/279] Use CMake's find package for CUDA libs (#19673) ### Description Answers issue #19640 More details are in the issue, basically I am changing all the include directory and link directory usage to CMake's `CUDA::*` targets --- cmake/CMakeLists.txt | 4 ++++ cmake/adjust_global_compile_flags.cmake | 2 +- .../external/onnxruntime_external_deps.cmake | 3 +-- cmake/onnxruntime_providers_cuda.cmake | 20 +++++++++---------- cmake/onnxruntime_providers_tensorrt.cmake | 11 +++++----- cmake/onnxruntime_python.cmake | 5 +---- cmake/onnxruntime_unittests.cmake | 4 ++-- .../core/providers/cuda/nvtx_profile.cc | 5 ----- 8 files changed, 25 insertions(+), 29 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 1376c90fbcefe..8453da19ce3a6 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1412,6 +1412,10 @@ endif() if (onnxruntime_USE_CUDA) set(CMAKE_CUDA_RUNTIME_LIBRARY Shared) set(CMAKE_CUDA_STANDARD 17) + if(onnxruntime_CUDA_HOME) + file(TO_CMAKE_PATH CUDAToolkit_ROOT ${onnxruntime_CUDA_HOME}) + endif() + find_package(CUDAToolkit REQUIRED) if(onnxruntime_CUDNN_HOME) file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME) endif() diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake index 8161ea574b8cc..d3f9256105127 100644 --- a/cmake/adjust_global_compile_flags.cmake +++ b/cmake/adjust_global_compile_flags.cmake @@ -205,7 +205,7 @@ endif() macro(check_nvcc_compiler_flag _FLAG _RESULT) - execute_process(COMMAND ${onnxruntime_CUDA_HOME}/bin/nvcc "${_FLAG}" RESULT_VARIABLE NVCC_OUT ERROR_VARIABLE NVCC_ERROR) + execute_process(COMMAND ${CUDAToolkit_BIN_DIR}/nvcc "${_FLAG}" RESULT_VARIABLE NVCC_OUT ERROR_VARIABLE NVCC_ERROR) message("NVCC_ERROR = ${NVCC_ERROR}") message("NVCC_OUT = ${NVCC_OUT}") if ("${NVCC_OUT}" MATCHES "0") diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 22d12b128dc1f..09d57164b4ee1 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -556,16 +556,15 @@ message("Finished fetching external dependencies") set(onnxruntime_LINK_DIRS ) if (onnxruntime_USE_CUDA) #TODO: combine onnxruntime_CUDNN_HOME and onnxruntime_CUDA_HOME, assume they are the same + find_package(CUDAToolkit REQUIRED) if (WIN32) if(onnxruntime_CUDNN_HOME) list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib ${onnxruntime_CUDNN_HOME}/lib/x64) endif() - list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/x64/lib64) else() if(onnxruntime_CUDNN_HOME) list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib ${onnxruntime_CUDNN_HOME}/lib64) endif() - list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/lib64) endif() endif() diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 9887d615c92d7..0f6d48bdb6ec8 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -178,15 +178,16 @@ add_dependencies(${target} onnxruntime_providers_shared 
${onnxruntime_EXTERNAL_DEPENDENCIES}) if(onnxruntime_CUDA_MINIMAL) target_compile_definitions(${target} PRIVATE USE_CUDA_MINIMAL) - target_link_libraries(${target} PRIVATE ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) + target_link_libraries(${target} PRIVATE ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface CUDA::cudart) else() - target_link_libraries(${target} PRIVATE cublasLt cublas cudnn curand cufft ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) + target_link_libraries(${target} PRIVATE CUDA::cublasLt CUDA::cublas cudnn CUDA::curand CUDA::cufft CUDA::cudart + ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) if(onnxruntime_CUDNN_HOME) target_include_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/include) target_link_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/lib) endif() endif() - + if (onnxruntime_USE_TRITON_KERNEL) # compile triton kernel, generate .a and .h files include(onnxruntime_compile_triton_kernel.cmake) @@ -196,25 +197,24 @@ target_include_directories(${target} PRIVATE ${triton_kernel_header_dir}) target_link_libraries(${target} PUBLIC -Wl,--whole-archive ${triton_kernel_obj_file} -Wl,--no-whole-archive) # lib cuda needed by cuLaunchKernel - target_link_libraries(${target} PRIVATE cuda) + target_link_libraries(${target} PRIVATE CUDA::cuda_driver) endif() include(cutlass) target_include_directories(${target} PRIVATE ${cutlass_SOURCE_DIR}/include ${cutlass_SOURCE_DIR}/examples) - target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} + PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) # ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found set_target_properties(${target} PROPERTIES LINKER_LANGUAGE CUDA) set_target_properties(${target} PROPERTIES FOLDER "ONNXRuntime") if (onnxruntime_ENABLE_CUDA_PROFILING) # configure cupti for cuda profiling - target_include_directories(${target} PRIVATE ${onnxruntime_CUDA_HOME}/extras/CUPTI/include) - target_link_directories(${target} PRIVATE ${onnxruntime_CUDA_HOME}/extras/CUPTI/lib64) - target_link_libraries(${target} PRIVATE cupti) + target_link_libraries(${target} PRIVATE CUDA::cupti) endif() - if (onnxruntime_ENABLE_NVTX_PROFILE AND NOT WIN32) - target_link_libraries(${target} PRIVATE nvToolsExt) + if (onnxruntime_ENABLE_NVTX_PROFILE) + target_link_libraries(${target} PRIVATE CUDA::nvtx3) endif() if (onnxruntime_ENABLE_TRAINING_OPS) diff --git a/cmake/onnxruntime_providers_tensorrt.cmake b/cmake/onnxruntime_providers_tensorrt.cmake index 686a993de3a4a..15ffc29e79ff4 100644 --- a/cmake/onnxruntime_providers_tensorrt.cmake +++ b/cmake/onnxruntime_providers_tensorrt.cmake @@ -8,7 +8,7 @@ set(BUILD_LIBRARY_ONLY 1) add_definitions("-DONNX_ML=1") add_definitions("-DONNX_NAMESPACE=onnx") - set(CUDA_INCLUDE_DIRS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + set(CUDA_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS}) set(TENSORRT_ROOT ${onnxruntime_TENSORRT_HOME}) set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) set(PROTOBUF_LIBRARY ${PROTOBUF_LIB}) @@ -58,7 +58,7 @@ URL_HASH SHA1=${DEP_SHA1_onnx_tensorrt} ) if (NOT CUDA_INCLUDE_DIR) - set(CUDA_INCLUDE_DIR ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # onnx-tensorrt repo needs this 
variable to build + set(CUDA_INCLUDE_DIR ${CUDAToolkit_INCLUDE_DIRS}) # onnx-tensorrt repo needs this variable to build endif() # The onnx_tensorrt repo contains a test program, getSupportedAPITest, which doesn't support Windows. It uses # unistd.h. So we must exclude it from our build. onnxruntime_fetchcontent_makeavailable is for the purpose. @@ -102,11 +102,12 @@ onnxruntime_add_include_to_target(onnxruntime_providers_tensorrt onnxruntime_common onnx flatbuffers::flatbuffers Boost::mp11 safeint_interface) add_dependencies(onnxruntime_providers_tensorrt onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES}) if (onnxruntime_USE_TENSORRT_BUILTIN_PARSER) - target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${trt_link_libs} cudart ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS}) + target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS} PUBLIC CUDA::cudart) else() - target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} cudart ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS}) + target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS} PUBLIC CUDA::cudart) endif() - target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} + PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) if(onnxruntime_CUDNN_HOME) target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${onnxruntime_CUDNN_HOME}/include) endif() diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index 3f20787e87425..23c6e5e430875 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -282,10 +282,7 @@ if (WIN32) get_filename_component(CUDNN_DLL_NAME ${CUDNN_DLL_PATH} NAME_WE) string(REPLACE "cudnn64_" "" CUDNN_VERSION "${CUDNN_DLL_NAME}") if(NOT onnxruntime_CUDA_VERSION) - message("Reading json file ${onnxruntime_CUDA_HOME}/version.json") - set(CUDA_SDK_JSON_FILE_PATH "${onnxruntime_CUDA_HOME}/version.json") - file(READ ${CUDA_SDK_JSON_FILE_PATH} CUDA_SDK_JSON_CONTENT) - string(JSON onnxruntime_CUDA_VERSION GET ${CUDA_SDK_JSON_CONTENT} "cuda" "version") + set(onnxruntime_CUDA_VERSION ${CUDAToolkit_VERSION}) message("onnxruntime_CUDA_VERSION=${onnxruntime_CUDA_VERSION}") endif() file(APPEND "${VERSION_INFO_FILE}" diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 3ed695327c183..88f662075e177 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -67,7 +67,7 @@ function(AddTest) if(onnxruntime_USE_CUDA) #XXX: we should not need to do this. onnxruntime_test_all.exe should not have direct dependency on CUDA DLLs, # otherwise it will impact when CUDA DLLs can be unloaded. 
- target_link_libraries(${_UT_TARGET} PRIVATE cudart) + target_link_libraries(${_UT_TARGET} PRIVATE CUDA::cudart) endif() target_link_libraries(${_UT_TARGET} PRIVATE ${_UT_LIBS} GTest::gtest GTest::gmock ${onnxruntime_EXTERNAL_LIBRARIES}) endif() @@ -1268,7 +1268,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) list(APPEND onnxruntime_shared_lib_test_LIBS cpuinfo) endif() if (onnxruntime_USE_CUDA) - list(APPEND onnxruntime_shared_lib_test_LIBS cudart) + list(APPEND onnxruntime_shared_lib_test_LIBS CUDA::cudart) endif() if (onnxruntime_USE_ROCM) list(APPEND onnxruntime_shared_lib_test_LIBS hip::host) diff --git a/onnxruntime/core/providers/cuda/nvtx_profile.cc b/onnxruntime/core/providers/cuda/nvtx_profile.cc index 6c7c594066b86..867e7c1f24584 100644 --- a/onnxruntime/core/providers/cuda/nvtx_profile.cc +++ b/onnxruntime/core/providers/cuda/nvtx_profile.cc @@ -4,13 +4,8 @@ #ifdef ENABLE_NVTX_PROFILE #include "nvtx_profile.h" #include "core/common/common.h" -#if defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || defined(__MINGW32__) || defined(__BORLANDC__) #include #include -#else -#include -#include -#endif namespace onnxruntime { namespace profile { From f95c0773a129a4605b2161f5f9fddb8116c948d0 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Wed, 28 Feb 2024 10:40:40 +0800 Subject: [PATCH 076/279] Add share memory Flag in docker (#19672) ### Description ### Motivation and Context Ref: https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#setincshmem Co-authored-by: Your Name --- tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index 822bc559d992d..165bd804a8ad5 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -241,7 +241,7 @@ stages: script: | set -e -x mkdir -p $HOME/.onnx - docker run --gpus all --rm \ + docker run --gpus all --shm-size=1g --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --rm \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory)/Release:/build/Release \ --volume /data/models:/build/models:ro \ From 026e3178ae71cfcc5cfa2decde9a7d64b935d255 Mon Sep 17 00:00:00 2001 From: pengwa Date: Wed, 28 Feb 2024 15:57:05 +0800 Subject: [PATCH 077/279] Improve memory matrix for ORTModule (#19620) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Memory matrix for ORTModule Collect parameter/gradient/buffers sizes also. Exposed as a function, can be used externally for debugging purpose. 
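The `log_memory_usage` helper added by this change (see the `torch_profile_utils.py` diff below) can also be called directly from user code, as the description notes. A minimal usage sketch (the model, logger name, and tensor shapes are illustrative placeholders, and a CUDA device is assumed):

```
import logging

import torch

from onnxruntime.training.utils import log_memory_usage

logger = logging.getLogger("orttraining")
model = torch.nn.Linear(1024, 1024).cuda()  # any torch.nn.Module; passing it makes the helper report param/grad/buffer sizes

# Log CUDA allocator stats plus parameter/gradient/buffer totals around a training step.
log_memory_usage("pre_forward", rank_0_only=True, step_info="step 0", logger=logger, module=model)
loss = model(torch.randn(8, 1024, device="cuda")).sum()
loss.backward()
log_memory_usage("post_backward", rank_0_only=True, step_info="step 0", logger=logger, module=model)
```

Each call emits a single summary line like the ones captured below.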
``` 2024-02-27 07:18:55,283 orttraining.rank-0 [INFO] - rank-0 step 1 memory (MiB) | phase: pre_forward | allocated: 5331 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 219 | max inactive: 816 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,322 orttraining.rank-0 [INFO] - rank-0 step 1 memory (MiB) | phase: post_forward | allocated: 8162 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 816 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,358 orttraining.rank-0 [INFO] - rank-0 step 1 memory (MiB) | phase: pre_backward | allocated: 8926 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 816 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,438 orttraining.rank-0 [INFO] - rank-0 step 1 memory (MiB) | phase: post_backward | allocated: 6098 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 218 | max inactive: 831 | param: 5314 | grad: 12 | buffer: 8 0%|■| 2/3200 [01:27<32:05:11, 36.12s/it]2024-02-27 07:18:55,498 orttraining.rank-0 [INFO] - rank-0 step 2 memory (MiB) | phase: pre_forward | allocated: 5331 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 219 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,537 orttraining.rank-0 [INFO] - rank-0 step 2 memory (MiB) | phase: post_forward | allocated: 8162 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,576 orttraining.rank-0 [INFO] - rank-0 step 2 memory (MiB) | phase: pre_backward | allocated: 8926 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,657 orttraining.rank-0 [INFO] - rank-0 step 2 memory (MiB) | phase: post_backward | allocated: 6098 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 218 | max inactive: 831 | param: 5314 | grad: 12 | buffer: 8 0%|■| 3/3200 [01:27<17:30:57, 19.72s/it]2024-02-27 07:18:55,711 orttraining.rank-0 [INFO] - rank-0 step 3 memory (MiB) | phase: pre_forward | allocated: 5331 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 219 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,750 orttraining.rank-0 [INFO] - rank-0 step 3 memory (MiB) | phase: post_forward | allocated: 8162 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,786 orttraining.rank-0 [INFO] - rank-0 step 3 memory (MiB) | phase: pre_backward | allocated: 8926 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,867 orttraining.rank-0 [INFO] - rank-0 step 3 memory (MiB) | phase: post_backward | allocated: 6098 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 218 | max inactive: 831 | param: 5314 | grad: 12 | buffer: 8 [2024-02-27 07:18:55,886] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. 
Reducing hysteresis to 1 0%|▎ | 4/3200 [01:28<10:39:52, 12.01s/it]2024-02-27 07:18:55,902 orttraining.rank-0 [INFO] - rank-0 step 4 memory (MiB) | phase: pre_forward | allocated: 5331 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 219 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,944 orttraining.rank-0 [INFO] - rank-0 step 4 memory (MiB) | phase: post_forward | allocated: 8162 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,979 orttraining.rank-0 [INFO] - rank-0 step 4 memory (MiB) | phase: pre_backward | allocated: 8926 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,060 orttraining.rank-0 [INFO] - rank-0 step 4 memory (MiB) | phase: post_backward | allocated: 6098 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 218 | max inactive: 831 | param: 5314 | grad: 12 | buffer: 8 0%|■| 5/3200 [01:28<6:53:04, 7.76s/it]2024-02-27 07:18:56,115 orttraining.rank-0 [INFO] - rank-0 step 5 memory (MiB) | phase: pre_forward | allocated: 5331 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 219 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,154 orttraining.rank-0 [INFO] - rank-0 step 5 memory (MiB) | phase: post_forward | allocated: 8162 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,190 orttraining.rank-0 [INFO] - rank-0 step 5 memory (MiB) | phase: pre_backward | allocated: 8926 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,270 orttraining.rank-0 [INFO] - rank-0 step 5 memory (MiB) | phase: post_backward | allocated: 6098 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 218 | max inactive: 831 | param: 5314 | grad: 12 | buffer: 8 0%|■| 6/3200 [01:28<4:36:19, 5.19s/it]2024-02-27 07:18:56,323 orttraining.rank-0 [INFO] - rank-0 step 6 memory (MiB) | phase: pre_forward | allocated: 5331 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 219 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,365 orttraining.rank-0 [INFO] - rank-0 step 6 memory (MiB) | phase: post_forward | allocated: 8162 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,398 orttraining.rank-0 [INFO] - rank-0 step 6 memory (MiB) | phase: pre_backward | allocated: 8926 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,478 orttraining.rank-0 [INFO] - rank-0 step 6 memory (MiB) | phase: post_backward | allocated: 6098 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 218 | max inactive: 831 | param: 5314 | grad: 12 | buffer: 8 0%|▌ | 7/3200 [01:28<3:09:33, 3.56s/it]2024-02-27 07:18:56,533 orttraining.rank-0 [INFO] - rank-0 step 7 memory (MiB) | phase: pre_forward | allocated: 5331 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 219 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,572 orttraining.rank-0 [INFO] - rank-0 step 7 memory (MiB) | phase: post_forward | allocated: 8162 | max allocated: 9039 | cached: 9382 | 
max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,608 orttraining.rank-0 [INFO] - rank-0 step 7 memory (MiB) | phase: pre_backward | allocated: 8926 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,727 orttraining.rank-0 [INFO] - rank-0 step 7 memory (MiB) | phase: post_backward | allocated: 6098 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 218 | max inactive: 831 | param: 5314 | grad: 12 | buffer: 8 0%|▌ | 8/3200 [01:28<2:13:48, 2.52s/it]2024-02-27 07:18:56,806 orttraining.rank-0 [INFO] - rank-0 step 8 memory (MiB) | phase: pre_forward | allocated: 5331 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 219 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,846 orttraining.rank-0 [INFO] - rank-0 step 8 memory (MiB) | phase: post_forward | allocated: 8162 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,882 orttraining.rank-0 [INFO] - rank-0 step 8 memory (MiB) | phase: pre_backward | allocated: 8926 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,962 orttraining.rank-0 [INFO] - rank-0 step 8 memory (MiB) | phase: post_backward | allocated: 6098 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 218 | max inactive: 831 | param: 5314 | grad: 12 | buffer: 8 0%|▋ | 9/3200 [01:29<1:36:03, 1.81s/it]2024-02-27 07:18:57,053 orttraining.rank-0 [INFO] - rank-0 step 9 memory (MiB) | phase: pre_forward | allocated: 5331 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 219 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:57,094 orttraining.rank-0 [INFO] - rank-0 step 9 memory (MiB) | phase: post_forward | allocated: 8162 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 ``` --- .../training/ortmodule/_runtime_inspector.py | 37 +++------ .../python/training/utils/__init__.py | 2 + .../training/utils/torch_profile_utils.py | 76 +++++++++++++++++++ 3 files changed, 88 insertions(+), 27 deletions(-) diff --git a/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py b/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py index 078ce4d27cd6f..772b9bd9e31ae 100644 --- a/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py +++ b/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py @@ -14,7 +14,7 @@ from sympy import Symbol, simplify from sympy.parsing.sympy_parser import parse_expr -from onnxruntime.training.utils import PTable +from onnxruntime.training.utils import PTable, log_memory_usage from ._execution_agent import TrainingAgent from .options import _MemoryOptimizationLevel, _RuntimeOptions @@ -509,6 +509,8 @@ def __init__(self, m: torch.nn.Module, logger: Logger): self._is_first_inspect = True + self._m = m + def is_enabled(self) -> bool: """Check if memory inspector is enabled.""" return self._is_enabled @@ -621,29 +623,13 @@ def inspect_memory(self, cur_phase: Phase): need_print = self._current_step < 10 or (self._current_step & (self._current_step - 1) == 0) if need_print: - cur_mem_allocated = self._normalize(torch.cuda.memory_allocated()) - max_mem_allocated = 
self._normalize(torch.cuda.max_memory_allocated()) - cur_mem_cached = self._normalize(torch.cuda.memory_reserved()) - max_mem_cached = self._normalize(torch.cuda.max_memory_reserved()) - torch_mem_stat = torch.cuda.memory_stats() - cur_mem_inactive = self._normalize(torch_mem_stat.get("inactive_split_bytes.all.current", 0)) - max_mem_inactive = self._normalize(torch_mem_stat.get("inactive_split_bytes.all.peak", 0)) - - mem_stats = [ - ["phase", _convert_phase_to_string(cur_phase)], - ["allocated", cur_mem_allocated], # current memory allocated for tensors - ["max allocated", max_mem_allocated], # peak memory allocated for tensors - ["cached", cur_mem_cached], # current memory cached for the caching allocator - ["max cached", max_mem_cached], # peak memory cached for caching allocator. - ["inactive", cur_mem_inactive], # amount of inactive, non-releasable memory - ["max inactive", max_mem_inactive], # peak of inactive, non-releasable memory - ] - - summ = f"{self._rank_info} step {self._current_step} memory ({MemoryObserver.NORMALIZER_UNIT})" - for stat in mem_stats: - summ += f" | {stat[0]}: {stat[1]}" - - self._logger.info(summ) + log_memory_usage( + _convert_phase_to_string(cur_phase), + rank_0_only=True, + step_info=f"step {self._current_step}", + logger=self._logger, + module=self._m, + ) if cur_phase == self._last_phase: self._increase_step() @@ -655,9 +641,6 @@ def inspect_memory(self, cur_phase: Phase): def _increase_step(self): self._current_step += 1 - def _normalize(self, mem_size_in_bytes: Union[float, int]) -> str: - return f"{float(mem_size_in_bytes) / MemoryObserver.NORMALIZER_FACTOR:.0f}" - def display_memory_optimization_plans(self, memory_optimizer_config, details=False) -> Tuple[List[str], PTable]: mem_plan_count = len(self.cluster_id_combination_to_saving_symbolics_map) diff --git a/orttraining/orttraining/python/training/utils/__init__.py b/orttraining/orttraining/python/training/utils/__init__.py index b4a518d573998..ecfb7d7907f3c 100644 --- a/orttraining/orttraining/python/training/utils/__init__.py +++ b/orttraining/orttraining/python/training/utils/__init__.py @@ -12,6 +12,7 @@ unflatten_data_using_schema, ) from onnxruntime.training.utils.torch_profile_utils import ( + log_memory_usage, nvtx_function_decorator, torch_nvtx_range_pop, torch_nvtx_range_push, @@ -31,6 +32,7 @@ "torch_nvtx_range_push", "torch_nvtx_range_pop", "nvtx_function_decorator", + "log_memory_usage", "pytorch_type_to_onnx_dtype", "onnx_dtype_to_pytorch_dtype", "pytorch_scalar_type_to_pytorch_dtype", diff --git a/orttraining/orttraining/python/training/utils/torch_profile_utils.py b/orttraining/orttraining/python/training/utils/torch_profile_utils.py index 382d7dac142fe..9e8a41e0dc7c8 100644 --- a/orttraining/orttraining/python/training/utils/torch_profile_utils.py +++ b/orttraining/orttraining/python/training/utils/torch_profile_utils.py @@ -3,6 +3,8 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- +from __future__ import annotations + import torch @@ -26,3 +28,77 @@ def wrapped_fn(*args, **kwargs): return ret_val return wrapped_fn + + +def log_memory_usage(cur_phase: str, rank_0_only=True, step_info="", logger=None, module=None): + """Log memory usage for the current phase. + Args: + cur_phase (str): The current phase. + rank_0_only (bool, optional): Only log the memory usage for rank 0. Defaults to True. + step_info (str, optional): The step information. Defaults to "". 
+ logger (logging.Logger, optional): The logger to log the memory usage. Defaults to None, which means print to stdout. + module (torch.nn.Module, optional): The module to get parameter, buffer and grad sizes. Defaults to None. + """ + rank = 0 + if rank_0_only is True: + if torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + if rank != 0: + return + + _normalizer_factor = float(1024 * 1024) + _normalizer_unit = "MiB" + + def _normalize(mem_size_in_bytes: float | int) -> str: + return f"{float(mem_size_in_bytes) / _normalizer_factor:.0f}" + + cur_mem_allocated = _normalize(torch.cuda.memory_allocated()) + max_mem_allocated = _normalize(torch.cuda.max_memory_allocated()) + cur_mem_cached = _normalize(torch.cuda.memory_reserved()) + max_mem_cached = _normalize(torch.cuda.max_memory_reserved()) + torch_mem_stat = torch.cuda.memory_stats() + cur_mem_inactive = _normalize(torch_mem_stat.get("inactive_split_bytes.all.current", 0)) + max_mem_inactive = _normalize(torch_mem_stat.get("inactive_split_bytes.all.peak", 0)) + + mem_stats = [ + ["phase", cur_phase], + ["allocated", cur_mem_allocated], # current memory allocated for tensors + ["max allocated", max_mem_allocated], # peak memory allocated for tensors + ["cached", cur_mem_cached], # current memory cached for the caching allocator + ["max cached", max_mem_cached], # peak memory cached for caching allocator. + ["inactive", cur_mem_inactive], # amount of inactive, non-releasable memory + ["max inactive", max_mem_inactive], # peak of inactive, non-releasable memory + ] + + # Calculate the total size of parameters and gradients in the model + if module: + param_total_size = 0 + grad_total_size = 0 + for p in module.parameters(): + if p.is_cuda: + param_total_size += p.numel() * p.element_size() + if p.grad is not None and p.grad.is_cuda: + grad_total_size += p.grad.numel() * p.grad.element_size() + + # Calculate the total size of buffers in the model + buffer_total_size = 0 + for b in module.buffers(): + if b.is_cuda: + buffer_total_size += b.numel() * b.element_size() + + mem_stats.extend( + [ + ["param", _normalize(param_total_size)], + ["grad", _normalize(grad_total_size)], + ["buffer", _normalize(buffer_total_size)], + ] + ) + + summ = f"rank-{rank} {step_info} memory ({_normalizer_unit})" + for stat in mem_stats: + summ += f" | {stat[0]}: {stat[1]}" + + if logger is None: + print(summ) + else: + logger.info(summ) From 7a147fc6f76a30b8d5875352afced515431ec7e5 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Wed, 28 Feb 2024 02:20:53 -0800 Subject: [PATCH 078/279] Remove a bash task from webgpu CI pipeline (#19682) ### Description It is a "Bash" task that requires running bash on Windows. Most Windows operating systems do not have Bash installed. Given this task is only debugging purposes, we can remove it for now. ### Motivation and Context I am making this change because I am regenerating the VM image in a different manner, and the new image does not contain bash. Once this PR is in, I can switch the images. 
--- .../github/azure-pipelines/templates/win-web-ci.yml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml index 8ba3517530edd..043da233cc674 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml @@ -155,12 +155,7 @@ jobs: path: $(Build.SourcesDirectory)/js/test/ cacheHitVar: CACHE_RESTORED displayName: 'Cache ONNX node test data' - - task: Bash@3 - inputs: - targetType: 'inline' - script: find "$(Build.SourcesDirectory)/js/test/" -type f - condition: and(not(canceled()), eq(variables.CACHE_RESTORED, 'true')) - displayName: 'List ONNX node test data' + - task: PowerShell@2 inputs: filePath: '$(Build.SourcesDirectory)\tools\ci_build\github\js\pack-npm-packages.ps1' From 913bdc7306e11b65644f733861684a3a460e8db0 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Wed, 28 Feb 2024 08:30:12 -0800 Subject: [PATCH 079/279] [QNN Quant] Handle external data for QNN preprocessing/quant (#19670) ### Description - Adds parameters to `qnn_preprocess_model()` to allow saving the new model with external data. - Updates `get_qnn_qdq_config()` to: - Load model without external data (it is not needed) - Return a quantization configuration with `use_external_data_format` set to `True` if the model has external data or if the model is >= 2GB. ### Motivation and Context Update QNN quantization to better handle large models that use external data. --- .../execution_providers/qnn/preprocess.py | 51 +++++- .../execution_providers/qnn/quant_config.py | 15 +- .../quantization/test_qnn_preprocess_model.py | 170 ++++++++++++++++++ .../test_tensor_quant_overrides_option.py | 30 ++++ 4 files changed, 261 insertions(+), 5 deletions(-) create mode 100644 onnxruntime/test/python/quantization/test_qnn_preprocess_model.py diff --git a/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py b/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py index b1c114fe1f9fd..b0dab81830c8b 100644 --- a/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py +++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py @@ -3,6 +3,8 @@ # Licensed under the MIT License. See License.txt in the project root for # license information. # -------------------------------------------------------------------------- +from __future__ import annotations + import logging from pathlib import Path @@ -13,7 +15,44 @@ from .fusion_lpnorm import FusionLpNormalization -def qnn_preprocess_model(model_input: Path, model_output: Path, fuse_layernorm: bool = False) -> bool: +def qnn_preprocess_model( + model_input: Path, + model_output: Path, + fuse_layernorm: bool = False, + save_as_external_data: bool = False, + all_tensors_to_one_file: bool = False, + external_data_location: str | None = None, + external_data_size_threshold: int = 1024, + external_data_convert_attribute: bool = False, +) -> bool: + """ + If necessary, this method creates a new "pre-processed" model in preparation for + quantization of a model to be used in QNN EP. Returns true if a new model was created. + + This method perfoms the following operations: + - Fuse Erf sequence into a single Gelu node. + - Fuse ReduceL2 sequence into a single LpNormalization node (p == 2). + - (Optional) Fuse ReduceMean sequence into a single LayerNormalization node. 
+ + Args: + model_input: Path to the input model file. + model_output: Path the output model file, which is only created if this method returns True. + fuse_layernorm: True if ReduceMean sequences should be fused into LayerNormalization nodes. + Defaults to False. + save_as_external_data: True if output model should be saved with external data. Defaults to false. + all_tensors_to_one_file: Effective only if save_as_external_data is true. Defaults to false. + If true, save all tensors to one external file specified by external_data_location. + If false, save each tensor to a file named with the tensor name. + external_data_location: Effective only if save_as_external_data is true. Defaults to None. + Specify the external file to which all tensors are saved. Path is relative + to the model path. If not specified, the model's name is used. + external_data_size_threshold: Effective only if save_as_external_data is true. Defaults to 1024. + Tensors with a data size >= external_data_size_threshold are converted to external data. + To convert every tensor with raw data to external data, set to 0. + external_data_convert_attribute: Effective only if save_as_external_data is true. Defaults to false. + If true, convert all tensors to external data. + If false, convert only non-attribute tensors to external data. + """ modified = False model = onnx.load_model(model_input) onnx_model = ONNXModel(model) @@ -57,6 +96,14 @@ def qnn_preprocess_model(model_input: Path, model_output: Path, fuse_layernorm: if modified: onnx_model.topological_sort() - onnx.save_model(model, model_output) + onnx.save_model( + model, + model_output, + save_as_external_data=save_as_external_data, + all_tensors_to_one_file=all_tensors_to_one_file, + location=external_data_location, + size_threshold=external_data_size_threshold, + convert_attribute=external_data_convert_attribute, + ) return modified diff --git a/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py b/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py index 7c2fa4f65ae1b..e9affae7ac263 100644 --- a/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py +++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py @@ -15,6 +15,7 @@ Q16_TYPES = {QuantType.QInt16, QuantType.QUInt16} Q8_TYPES = {QuantType.QInt8, QuantType.QUInt8} OP_TYPES_TO_EXCLUDE = {"Cast"} +MODEL_SIZE_THRESHOLD = 2147483648 # Quant model should use external data if >= 2GB def get_qnn_qdq_config( @@ -28,14 +29,21 @@ def get_qnn_qdq_config( if per_channel: raise ValueError("QNN EP does not yet support per-channel quantization.") - # Process model nodes to setup overrides. - model = onnx.load_model(model_input) + model = onnx.load_model(model_input, load_external_data=False) op_types = set() tensor_quant_overrides = {} + model_has_external_data = False + name_to_initializer = {} - name_to_initializer = {initializer.name: initializer for initializer in model.graph.initializer} + # Build map of initializers (name -> initializer) and + # check if the model has external data. 
+ for initializer in model.graph.initializer: + name_to_initializer[initializer.name] = initializer + if onnx.external_data_helper.uses_external_data(initializer): + model_has_external_data = True + # Setup quantization overrides for specific operator types for node in model.graph.node: op_types.add(node.op_type) @@ -89,5 +97,6 @@ def get_qnn_qdq_config( activation_type=activation_type, weight_type=weight_type, op_types_to_quantize=list(op_types.difference(OP_TYPES_TO_EXCLUDE)), + use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD), extra_options=extra_options, ) diff --git a/onnxruntime/test/python/quantization/test_qnn_preprocess_model.py b/onnxruntime/test/python/quantization/test_qnn_preprocess_model.py new file mode 100644 index 0000000000000..9b67fd41caac3 --- /dev/null +++ b/onnxruntime/test/python/quantization/test_qnn_preprocess_model.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- + +import math +import unittest +from pathlib import Path + +import numpy as np +import onnx + +from onnxruntime.quantization.execution_providers.qnn import qnn_preprocess_model +from onnxruntime.quantization.quant_utils import model_has_external_data, ms_domain + + +class TestQnnPreprocessModel(unittest.TestCase): + def build_model(self, shape, scale_val, bias_val): + """ + Build a model that supports 3 kinds of fusions: + - Erf sequence to Gelu + - ReduceL2 sequence to LpNormalization + - ReduceMean sequence to LayerNormalization + """ + root_inp = onnx.helper.make_tensor_value_info("root", onnx.TensorProto.FLOAT, shape) + output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, shape) + + # Erf sequence + one_const = onnx.numpy_helper.from_array(np.array(1.0, dtype=np.float32), "one_const") + half_const = onnx.numpy_helper.from_array(np.array(0.5, dtype=np.float32), "half_const") + root2_const = onnx.numpy_helper.from_array(np.array(math.sqrt(2.0), dtype=np.float32), "root2_const") + + e_mul0_node = onnx.helper.make_node("Mul", ["root", "half_const"], ["e_mul0_out"]) + e_div_node = onnx.helper.make_node("Div", ["root", "root2_const"], ["e_div_out"]) + e_erf_node = onnx.helper.make_node("Erf", ["e_div_out"], ["e_erf_out"]) + e_add_node = onnx.helper.make_node("Add", ["e_erf_out", "one_const"], ["e_add_out"]) + e_mul1_node = onnx.helper.make_node("Mul", ["e_add_out", "e_mul0_out"], ["erf_seq_output"]) + + # ReduceL2 sequence + axes_const = onnx.numpy_helper.from_array(np.array([-1], dtype=np.int64), "axes_const") + eps_const = onnx.numpy_helper.from_array(np.array(1e-12, dtype=np.float32), "eps_const") + shape_const = onnx.numpy_helper.from_array(np.array(list(shape), dtype=np.int64), "shape_const") + + l2_rl2_node = onnx.helper.make_node("ReduceL2", ["erf_seq_output", "axes_const"], ["l2_rl2_out"], keepdims=1) + l2_clip_node = onnx.helper.make_node("Clip", ["l2_rl2_out", "eps_const"], ["l2_clip_out"]) + l2_expand_node = onnx.helper.make_node("Expand", ["l2_clip_out", "shape_const"], ["l2_expand_out"]) + l2_div_node = onnx.helper.make_node("Div", ["erf_seq_output", "l2_expand_out"], ["l2_seq_output"]) + + # ReduceMean sequence + scale_const = onnx.numpy_helper.from_array(np.array(scale_val, dtype=np.float32), "scale_const") + 
bias_const = onnx.numpy_helper.from_array(np.array(bias_val, dtype=np.float32), "bias_const") + two_const = onnx.numpy_helper.from_array(np.array(2.0, dtype=np.float32), "two_const") + + m_rm0_node = onnx.helper.make_node("ReduceMean", ["l2_seq_output", "axes_const"], ["m_rm0_out"]) + m_sub_node = onnx.helper.make_node("Sub", ["l2_seq_output", "m_rm0_out"], ["m_sub_out"]) + m_pow_node = onnx.helper.make_node("Pow", ["m_sub_out", "two_const"], ["m_pow_out"]) + m_rm1_node = onnx.helper.make_node("ReduceMean", ["m_pow_out", "axes_const"], ["m_rm1_out"]) + m_add0_node = onnx.helper.make_node("Add", ["m_rm1_out", "eps_const"], ["m_add0_out"]) + m_sqrt_node = onnx.helper.make_node("Sqrt", ["m_add0_out"], ["m_sqrt_out"]) + m_div_node = onnx.helper.make_node("Div", ["m_sub_out", "m_sqrt_out"], ["m_div_out"]) + m_mul_node = onnx.helper.make_node("Mul", ["m_div_out", "scale_const"], ["m_mul_out"]) + m_add1_node = onnx.helper.make_node("Add", ["m_mul_out", "bias_const"], ["output"]) + + graph = onnx.helper.make_graph( + [ + e_mul0_node, + e_div_node, + e_erf_node, + e_add_node, + e_mul1_node, + l2_rl2_node, + l2_clip_node, + l2_expand_node, + l2_div_node, + m_rm0_node, + m_sub_node, + m_pow_node, + m_rm1_node, + m_add0_node, + m_sqrt_node, + m_div_node, + m_mul_node, + m_add1_node, + ], + "qnn_f32_model", + [root_inp], + [output], + initializer=[ + one_const, + half_const, + root2_const, + axes_const, + eps_const, + shape_const, + scale_const, + bias_const, + two_const, + ], + ) + opset_imports = [ + onnx.helper.make_opsetid("", 18), + ] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + return onnx.shape_inference.infer_shapes(model) + + def test_all_fusions(self): + """ + Test calling qnn_preprocess_model() with a model that supports all 3 fusions. + """ + model = self.build_model((1, 2, 3), [2.0, 2.0, 2.0], [1.0, 1.0, 1.0]) + onnx.save_model(model, "model.onnx") + modified = qnn_preprocess_model("model.onnx", "model.qnn_pp.onnx", fuse_layernorm=True) + + self.assertTrue(modified) + + fused_model = onnx.load_model("model.qnn_pp.onnx") + + # 3 fused Ops: Gelu, LpNorm, LayerNorm + self.assertEqual(len(fused_model.graph.node), 3) + expected_op_types = {"Gelu", "LpNormalization", "LayerNormalization"} + for node in fused_model.graph.node: + self.assertIn(node.op_type, expected_op_types) + + # Should have added "com.microsoft" opset import because we added a Gelu. + ms_domain_opset = next((opset for opset in fused_model.opset_import if opset.domain == ms_domain), None) + self.assertIsNotNone(ms_domain_opset) + self.assertEqual(ms_domain_opset.version, 1) + + def test_external_data(self): + """ + Test calling qnn_preprocess_model() with a model that uses external data. + The new preprocessed model should also have external data. + """ + model = self.build_model((1, 2, 3), [2.0, 2.0, 2.0], [1.0, 1.0, 1.0]) + onnx.save_model( + model, + "model.onnx", + save_as_external_data=True, + all_tensors_to_one_file=True, + location="weights.bin", + size_threshold=0, + ) + modified = qnn_preprocess_model( + "model.onnx", + "model.qnn_pp.onnx", + fuse_layernorm=True, + save_as_external_data=True, + all_tensors_to_one_file=True, + external_data_location="weights2.bin", + external_data_size_threshold=0, + ) + + self.assertTrue(modified) + + # Model should still have external data. 
+ self.assertTrue(model_has_external_data(Path("model.qnn_pp.onnx"))) + + fused_model = onnx.load_model("model.qnn_pp.onnx", load_external_data=False) + + # 3 fused Ops: Gelu, LpNorm, LayerNorm + self.assertEqual(len(fused_model.graph.node), 3) + expected_op_types = {"Gelu", "LpNormalization", "LayerNormalization"} + for node in fused_model.graph.node: + self.assertIn(node.op_type, expected_op_types) + + +if __name__ == "__main__": + unittest.main() diff --git a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py index 0470953e385b6..cbb6b3ae2e776 100644 --- a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py +++ b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py @@ -555,6 +555,36 @@ def test_get_qnn_qdq_config(self): self.assertEqual(sig_out_zp.data_type, onnx.TensorProto.UINT16) self.assertEqual(sig_out_sc.float_data[0], np.float32(1.0 / 65536.0)) + def test_get_qnn_qdq_config_ext_data(self): + """ + Test that get_qnn_qdq_config() returns a config that enables external data + if the input model has external data. + """ + + # Create model with a weight large enough (> 1024 bytes) to be stored externally. + large_weight = onnx.numpy_helper.from_array(np.random.random((1, 32, 32)).astype(np.float32), "weight") + graph = onnx.helper.make_graph( + [onnx.helper.make_node("Add", ["input", "weight"], ["output"])], + "add_ext_data", + [onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, (1, 32, 32))], + [onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, (1, 32, 32))], + initializer=[large_weight], + ) + model = onnx.helper.make_model( + graph, + opset_imports=[onnx.helper.make_opsetid("", 18)], + ) + onnx.save_model( + model, + "add_ext_data.onnx", + save_as_external_data=True, + all_tensors_to_one_file=True, + location="add_ext_data.bin", + ) + + qnn_config = get_qnn_qdq_config("add_ext_data.onnx", DummyDataReader(self.activations)) + self.assertTrue(qnn_config.use_external_data_format) + if __name__ == "__main__": t = TestTensorQuantOverridesOption() From a93c31e3c9971063d8dfe45a627a80cbdcf99ed9 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Wed, 28 Feb 2024 12:03:17 -0800 Subject: [PATCH 080/279] Update dml-vs-2022.yml (#19687) ### Description Fix a build error in "Zip-Nuget-Java-Nodejs Packaging Pipeline" which deletes files too early. 
--- .../nuget/templates/dml-vs-2022.yml | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml index 9393fb07d718a..d6bb415a68ee6 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml @@ -55,6 +55,9 @@ stages: - checkout: self clean: true submodules: recursive + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() - powershell: | if($env:TELEMETRYGUID) @@ -231,14 +234,7 @@ stages: searchPattern: '**/*.pdb' symbolServerType: teamServices - - ${{ if eq(parameters['DoCompliance'], 'true') }}: - - template: ../../templates/compliance.yml - parameters : - msbuildPlatform: ${{ parameters.sln_platform }} - - template: ../../templates/component-governance-component-detection-steps.yml - parameters : - condition : 'succeeded' # Node.js Publish - ${{ if eq(parameters['DoNodejsPack'], 'true') }}: @@ -294,6 +290,12 @@ stages: targetPath: '$(Build.SourcesDirectory)\js\node\bin\napi-v3\win32\${{ parameters.sln_platform }}' artifactName: 'drop-onnxruntime-nodejs-win-${{ parameters.sln_platform }}-dml' - - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 - displayName: 'Clean Agent Directories' - condition: always() + + - ${{ if eq(parameters['DoCompliance'], 'true') }}: + - template: ../../templates/compliance.yml + parameters : + msbuildPlatform: ${{ parameters.sln_platform }} + + - template: ../../templates/component-governance-component-detection-steps.yml + parameters : + condition : 'succeeded' From e30618d05535d3fe0fdc34d350d78e8ad01b64d5 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 28 Feb 2024 16:05:08 -0800 Subject: [PATCH 081/279] [js/webgpu] use Headless for webgpu test by default (#19702) ### Description use Chromium Headless for webgpu test by default. Still use normal Chromium with window when debug=true or perfMode=true. Use the [`--headless=new`](https://developer.chrome.com/docs/chromium/new-headless) mode. ### Motivation and Context try to use a more stable way to launch npm tests to avoid a "chrome not found" issue in pipeline, which may potentially caused by windowed application. --- js/web/karma.conf.js | 4 ++-- js/web/script/test-runner-cli.ts | 29 +++++++---------------------- 2 files changed, 9 insertions(+), 24 deletions(-) diff --git a/js/web/karma.conf.js b/js/web/karma.conf.js index 8fce79843f617..9e44d9c0d9652 100644 --- a/js/web/karma.conf.js +++ b/js/web/karma.conf.js @@ -86,11 +86,11 @@ module.exports = function(config) { hostname, listenAddress, customLaunchers: { - // the following flags are used to make sure Edge on CI agents to initialize WebGPU correctly. 
+ // Chromium-based browsers EdgeTest: {base: 'Edge', flags: chromiumFlags}, ChromeTest: {base: 'Chrome', flags: chromiumFlags}, - ChromeTestHeadless: {base: 'ChromeHeadless', flags: chromiumFlags}, ChromeCanaryTest: {base: 'ChromeCanary', flags: chromiumFlags}, + // // ==== BrowserStack browsers ==== // diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts index 9105c02412e34..59bd0d5f6313a 100644 --- a/js/web/script/test-runner-cli.ts +++ b/js/web/script/test-runner-cli.ts @@ -542,14 +542,13 @@ async function main() { npmlog.info('TestRunnerCli.Run', '(4/4) Running karma to start test runner...'); const webgpu = args.backends.indexOf('webgpu') > -1; const webnn = args.backends.indexOf('webnn') > -1; - const browser = getBrowserNameFromEnv( - args.env, - args.bundleMode === 'perf' ? 'perf' : - args.debug ? 'debug' : - 'test', - webgpu); + const browser = getBrowserNameFromEnv(args.env); const karmaArgs = ['karma', 'start', `--browsers ${browser}`]; const chromiumFlags = ['--enable-features=SharedArrayBuffer', ...args.chromiumFlags]; + if (args.bundleMode === 'dev' && !args.debug) { + // use headless for 'test' mode (when 'perf' and 'debug' are OFF) + chromiumFlags.push('--headless=new'); + } if (args.debug) { karmaArgs.push('--log-level info --timeout-mocha 9999999'); chromiumFlags.push('--remote-debugging-port=9333'); @@ -662,10 +661,10 @@ async function main() { fs.writeJSONSync(path.join(TEST_ROOT, './testdata-config.json'), config); } - function getBrowserNameFromEnv(env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test', webgpu: boolean) { + function getBrowserNameFromEnv(env: TestRunnerCliArgs['env']) { switch (env) { case 'chrome': - return selectChromeBrowser(mode, webgpu); + return 'ChromeTest'; case 'edge': return 'EdgeTest'; case 'firefox': @@ -680,20 +679,6 @@ async function main() { throw new Error(`env "${env}" not supported.`); } } - - function selectChromeBrowser(mode: 'debug'|'perf'|'test', webgpu: boolean) { - if (webgpu) { - return 'ChromeTest'; - } else { - switch (mode) { - case 'debug': - case 'perf': - return 'ChromeTest'; - default: - return 'ChromeTestHeadless'; - } - } - } } void main(); From 250779474de0ce50f0ef4b39f7b050755e1019ba Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Wed, 28 Feb 2024 19:36:26 -0800 Subject: [PATCH 082/279] Change "onnxruntime-Linux-CPU-For-Android-CI" machine pool to "onnxruntime-Ubuntu2204-AMD-CPU" (#19698) ### Description The original one reports "out of disk space", which needs to be investigated. 
--- .../android-x86_64-crosscompile-ci-pipeline.yml | 6 +++--- .../azure-pipelines/linux-cpu-minimal-build-ci-pipeline.yml | 2 +- .../github/azure-pipelines/mac-react-native-ci-pipeline.yml | 2 +- .../templates/android-binary-size-check-stage.yml | 3 ++- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml index 9136b21aec626..d0a22aae07741 100644 --- a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml @@ -53,7 +53,7 @@ stages: Codeql.Enabled: false jobs: - job: Build_CPU_EP - pool: onnxruntime-Linux-CPU-For-Android-CI + pool: onnxruntime-Ubuntu2204-AMD-CPU workspace: clean: all timeoutInMinutes: 30 @@ -140,7 +140,7 @@ stages: jobs: - job: Build_NNAPI_EP - pool: onnxruntime-Linux-CPU-For-Android-CI + pool: onnxruntime-Ubuntu2204-AMD-CPU timeoutInMinutes: ${{ variables.JobsTimeout }} workspace: clean: all @@ -456,7 +456,7 @@ stages: variables: - name: skipComponentGovernanceDetection value: true - pool: 'onnxruntime-Linux-CPU-For-Android-CI' + pool: 'onnxruntime-Ubuntu2204-AMD-CPU' condition: and(succeeded(), in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')) dependsOn: - NNAPI_EP_MASTER diff --git a/tools/ci_build/github/azure-pipelines/linux-cpu-minimal-build-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-cpu-minimal-build-ci-pipeline.yml index 1053a2518125f..bbea7a0d114e8 100644 --- a/tools/ci_build/github/azure-pipelines/linux-cpu-minimal-build-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-cpu-minimal-build-ci-pipeline.yml @@ -59,7 +59,7 @@ jobs: timeoutInMinutes: 120 workspace: clean: all - pool: onnxruntime-Linux-CPU-For-Android-CI + pool: onnxruntime-Ubuntu2204-AMD-CPU variables: ORT_CACHE_DIR: $(Pipeline.Workspace)/ort_ccache TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] diff --git a/tools/ci_build/github/azure-pipelines/mac-react-native-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-react-native-ci-pipeline.yml index e8f4931d5ad9f..886bacf5aac4d 100644 --- a/tools/ci_build/github/azure-pipelines/mac-react-native-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/mac-react-native-ci-pipeline.yml @@ -61,4 +61,4 @@ stages: parameters: NpmPackagingMode: ${{ variables.NpmPackagingMode }} BuildConfig: 'Release' - PoolName: 'onnxruntime-Linux-CPU-For-Android-CI' + PoolName: 'onnxruntime-Ubuntu2204-AMD-CPU' diff --git a/tools/ci_build/github/azure-pipelines/templates/android-binary-size-check-stage.yml b/tools/ci_build/github/azure-pipelines/templates/android-binary-size-check-stage.yml index 733cafdeeb8c0..9822950127112 100644 --- a/tools/ci_build/github/azure-pipelines/templates/android-binary-size-check-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/android-binary-size-check-stage.yml @@ -31,7 +31,7 @@ stages: timeoutInMinutes: 60 workspace: clean: all - pool: onnxruntime-Linux-CPU-For-Android-CI + pool: onnxruntime-Ubuntu2204-AMD-CPU steps: - checkout: self clean: true @@ -49,6 +49,7 @@ stages: - task: PythonScript@0 displayName: 'Set variables from config file "${{ parameters.BuildConfigFile }}"' inputs: + pythonInterpreter: /usr/bin/python3 scriptSource: inline script: | import json From 7455dd1f32af760984f42e8e6d752b675a4a0852 Mon Sep 17 00:00:00 2001 From: Sophie Schoenmeyer 
<107952697+sophies927@users.noreply.github.com> Date: Wed, 28 Feb 2024 21:10:25 -0800 Subject: [PATCH 083/279] Update labeler.yml to change permissions (#19709) ### Description Updated github/issue-labeler permissions to give write access for issues. Tried to submit the same PR last week, but the checks kept failing, so I couldn't merge. ### Motivation and Context Enables issue labeling again, which has been broken since GitHub Actions permissions were changed a couple weeks ago. --- .github/workflows/labeler.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 936ab0de899a2..a196226a4b836 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -3,6 +3,9 @@ on: issues: types: [opened, edited] +permissions: + issues: write + jobs: triage: runs-on: ubuntu-latest From d2e6dd25ea8bd528f614250ba0165a535734305e Mon Sep 17 00:00:00 2001 From: Vincent Wang Date: Thu, 29 Feb 2024 13:45:58 +0800 Subject: [PATCH 084/279] Merge GatherToSplitFusion and #19218 to a General Fusion (#19600)

#19218 tried to fuse Gather/Slice to Split, but the logic has a problem: scalar indices and 1-dim indices in a Gather node produce different results. A scalar index produces a result tensor with the gathered axis removed, while a 1-dim indices tensor keeps that axis, even when its dim value is 1. For example,

Node |-> Gather(indices=[0], axis=axis)
     |-> Gather(indices=[1], axis=axis)
     |-> Slice(index=2, axis=axis)

is the same as

Node |-> Split(axis=axis)

But

Node |-> Gather(indices=0, axis=axis)
     |-> Gather(indices=1, axis=axis)
     |-> Slice(index=2, axis=axis)

is the same as

Node |-> Split(axis=axis) |-> Squeeze(axis=axis)
                          |-> Squeeze(axis=axis)
                          |->

The previous PR didn't take such Squeeze/Unsqueeze cases into account. This PR merges #19218 and GatherToSplitFusion into a general fusion, which relaxes the limit on the number of Gather and Slice nodes: it checks all Gather and Slice consumers, and if the indices of the Gathers and the start/end of the Slices cover the specific dim of the input tensor, they can be fused into a Split, adding a Squeeze where necessary according to the dim count of the indices tensor in each Gather.

@rui-ren, please check if the fix can still be applied to your model.
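To make the scalar-vs-1-dim distinction concrete, here is a small numpy sketch (an editor's illustration, not part of the patch; the array shape and axis are arbitrary) showing why the fused Split needs a trailing Squeeze only for the Gather nodes that used scalar indices:

```python
import numpy as np

x = np.random.rand(2, 3, 4).astype(np.float32)  # fuse along axis=1, dim size 3

# Gather with a scalar index drops the gathered axis...
gather_scalar = np.take(x, 0, axis=1)    # shape (2, 4)
# ...while a 1-dim index of length 1 keeps it.
gather_1dim = np.take(x, [1], axis=1)    # shape (2, 1, 4)
# Slice always keeps the axis.
slice_out = x[:, 2:3, :]                 # shape (2, 1, 4)

# Fused form: one Split over the covered axis, plus a Squeeze only where the
# original Gather used a scalar index.
s0, s1, s2 = np.split(x, [1, 2], axis=1)  # each has shape (2, 1, 4)
assert np.array_equal(np.squeeze(s0, axis=1), gather_scalar)
assert np.array_equal(s1, gather_1dim)
assert np.array_equal(s2, slice_out)
```

Here the indices 0 and 1 plus the slice [2, 3) cover the whole axis without overlap, which is the condition the new fusion checks before rewriting the graph.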
--- onnxruntime/core/optimizer/gather_fusion.cc | 318 ++++++---- onnxruntime/core/optimizer/gather_fusion.h | 16 +- .../core/optimizer/gather_slice_fusion.cc | 344 ----------- .../core/optimizer/gather_slice_fusion.h | 32 - .../core/optimizer/graph_transformer_utils.cc | 4 +- .../test/optimizer/graph_transform_test.cc | 550 +++++------------- .../core/optimizer/graph_transformer_utils.cc | 4 +- 7 files changed, 352 insertions(+), 916 deletions(-) delete mode 100644 onnxruntime/core/optimizer/gather_slice_fusion.cc delete mode 100644 onnxruntime/core/optimizer/gather_slice_fusion.h diff --git a/onnxruntime/core/optimizer/gather_fusion.cc b/onnxruntime/core/optimizer/gather_fusion.cc index 4903bc1d6b961..90cabff88122c 100644 --- a/onnxruntime/core/optimizer/gather_fusion.cc +++ b/onnxruntime/core/optimizer/gather_fusion.cc @@ -9,55 +9,144 @@ namespace onnxruntime { -bool GatherToSplitFusion::IsSupportedGather(const Graph& graph, const Node& node, int64_t& index, int64_t& axis, - int64_t& indices_n_dims) const { - if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Gather", {1, 11, 13}) || +namespace { +static int64_t GetGatherAxis(const Node& node, int64_t rank) { + int64_t axis = 0; + auto& attrs = node.GetAttributes(); + if (attrs.find("axis") != attrs.end()) { + auto& axis_attr = attrs.at("axis"); + if (utils::HasInt(axis_attr)) { + axis = axis_attr.i(); + if (axis < 0) axis += rank; + } + } + return axis; +} + +static bool GetScalarInt64Initializer(const Graph& graph, const NodeArg& node_arg, int64_t& value, int64_t& rank) { + if (!optimizer_utils::IsScalar(node_arg)) return false; + const ONNX_NAMESPACE::TensorProto* tensor_proto = graph_utils::GetConstantInitializer(graph, node_arg.Name()); + if (!tensor_proto || tensor_proto->data_type() != ONNX_NAMESPACE::TensorProto::INT64) return false; + Initializer init_const{*tensor_proto, graph.ModelPath()}; + value = *(init_const.data()); + rank = tensor_proto->dims_size(); + return true; +} + +static bool GetSliceAxis(const Graph& graph, const Node& node, int64_t rank, int64_t& axis) { + if (node.InputDefs().size() < 4) return false; + int64_t unused = 0; + if (!GetScalarInt64Initializer(graph, *node.InputDefs()[3], axis, unused)) return false; + if (axis < 0) axis += rank; + return true; +} + +static bool GetAxis(const Graph& graph, const Node& node, int64_t rank, int64_t& axis) { + if (node.OpType() == "Gather") { + axis = GetGatherAxis(node, rank); + return true; + } + if (node.OpType() == "Slice") { + return GetSliceAxis(graph, node, rank, axis); + } + return false; +} + +} // namespace + +bool GatherSliceToSplitFusion::IsSupportedGather(const Graph& graph, const Node& node, int64_t rank, + int64_t target_axis, int64_t dim_size, InlinedVector& consumed, + int64_t& start, bool& need_squeeze) const { + if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Gather", {13}) || !graph_utils::IsSupportedProvider(node, GetCompatibleExecutionProviders())) { return false; } - const NodeArg& input_arg = *(node.InputDefs()[1]); - if (!optimizer_utils::IsScalar(input_arg)) return false; - const ONNX_NAMESPACE::TensorProto* tensor_proto = graph_utils::GetConstantInitializer(graph, input_arg.Name()); - if (!tensor_proto) return false; - if (tensor_proto->data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT64) return false; - Initializer init_const{*tensor_proto, graph.ModelPath()}; - index = *(init_const.data()); - axis = 0; // Default value. 
- auto& attrs = node.GetAttributes(); - if (attrs.find("axis") != attrs.end()) { - auto& axis_attr = attrs.at("axis"); - if (utils::HasInt(axis_attr)) axis = axis_attr.i(); + if (GetGatherAxis(node, rank) != target_axis) return false; + // Require the indices input to be a scalar tensor for now. Normally if not, the exporter will choose Slice. + // We can relax this later if needed. + int64_t indices_n_dims = 0; + if (!GetScalarInt64Initializer(graph, *(node.InputDefs()[1]), start, indices_n_dims)) return false; + if (start < 0) start += dim_size; + if (start < 0 || start >= dim_size || consumed[static_cast(start)]) return false; + consumed[static_cast(start)] = true; + need_squeeze = indices_n_dims == 0; + return true; +} + +bool GatherSliceToSplitFusion::IsSupportedSlice(const Graph& graph, const Node& node, int64_t rank, int64_t target_axis, + int64_t dim_size, InlinedVector& consumed, int64_t& start, + int64_t& end) const { + if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Slice", {13}) || + !graph_utils::IsSupportedProvider(node, GetCompatibleExecutionProviders())) { + return false; + } + + int64_t axis = 0; + if (!GetSliceAxis(graph, node, rank, axis) || axis != target_axis) return false; + int64_t unused = 0; + if (!GetScalarInt64Initializer(graph, *node.InputDefs()[1], start, unused) || + !GetScalarInt64Initializer(graph, *node.InputDefs()[2], end, unused)) { + return false; + } + // Handling start and end according to schema definition. + if (start < 0) start += dim_size; + if (end < 0) end += dim_size; + if (start < 0) + start = 0; + else if (start > dim_size) + start = dim_size; + if (end < 0) + end = 0; + else if (end > dim_size) + end = dim_size; + if (start >= end) return false; + if (node.InputDefs().size() >= 5) { + int64_t step = 0; + if (!GetScalarInt64Initializer(graph, *node.InputDefs()[4], step, unused) || step != 1) return false; + } + for (int64_t i = start; i < end; ++i) { + if (consumed[static_cast(i)]) return false; + consumed[static_cast(i)] = true; } - indices_n_dims = tensor_proto->dims_size(); return true; } /* -GatherToSplitFusion is to fuse: -Node -> Gather(index=0, axis=axis) - |-> Gather(index=1, axis=axis) - |-> Gather(index=2, axis=axis) +GatherSliceToSplitFusion is to fuse: +Node -> Gather(indices=0, axis=axis) + |-> Gather(indices=[1], axis=axis) + |-> Slice(start=2, end=3, axes=[axis]) |... To Node -> Split -> Squeeze(axis=axis) - |-> Squeeze(axis=axis) - |-> Squeeze(axis=axis) + |-> + |-> |... So that we can use one kernel to finish the job. +The fusion requires that the indices of Gather nodes and start/end of Slice nodes are not overlapping and cover +all the elements in the target axis. Step of Slice node should be 1. */ -Status GatherToSplitFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, - const logging::Logger& logger) const { +Status GatherSliceToSplitFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, + const logging::Logger& logger) const { + // Squeeze, Gather, Slice and Split have different schemas before and after OpSet 13. + // To make code simple, support OpSet >= 13 only. 
+ int onnx_opset_version = -1; + if (graph.DomainToVersionMap().find(kOnnxDomain) != graph.DomainToVersionMap().end()) { + onnx_opset_version = graph.DomainToVersionMap().at(kOnnxDomain); + } + if (onnx_opset_version < 13) return Status::OK(); + GraphViewer graph_viewer(graph); const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder(); - InlinedVector node_args; + InlinedVector candidate_args; for (auto node_arg : graph.GetInputs()) { if (node_arg && graph.GetConsumerNodes(node_arg->Name()).size() > 1) { - node_args.push_back(node_arg); + candidate_args.push_back(node_arg); } } @@ -65,7 +154,7 @@ Status GatherToSplitFusion::ApplyImpl(Graph& graph, bool& modified, int graph_le if (graph.GetConsumerNodes(entry.first).size() > 1) { auto node_arg = graph.GetNodeArg(entry.first); if (node_arg) { - node_args.push_back(node_arg); + candidate_args.push_back(node_arg); } } } @@ -90,129 +179,108 @@ Status GatherToSplitFusion::ApplyImpl(Graph& graph, bool& modified, int graph_le size_t output_count = node.GetOutputEdgesCount(); if (output_count <= 1) continue; - node_args.push_back(node.OutputDefs()[0]); + candidate_args.push_back(node.OutputDefs()[0]); } - for (const NodeArg* node_arg : node_args) { + for (const NodeArg* node_arg : candidate_args) { auto shape = node_arg->Shape(); if (!shape) continue; int64_t rank = static_cast(shape->dim_size()); - - bool can_fuse = true; - bool first_edge = true; - int64_t split_axis = 0; - int64_t indices_n_dims = -1; auto consumers = graph.GetConsumerNodes(node_arg->Name()); - size_t consumer_count = consumers.size(); - InlinedVector gather_outputs(consumer_count, nullptr); - InlinedVector> nodes_to_fuse; + InlinedVector condidate_consumers; for (auto consumer : consumers) { - int64_t index, axis, dims; - if (!consumer || consumer->InputDefs()[0] != node_arg || - !IsSupportedGather(graph, *consumer, index, axis, dims)) { - can_fuse = false; - break; - } - if (indices_n_dims == -1) { - indices_n_dims = dims; - } else if (indices_n_dims != dims) { - // Not the same number of dimensions (0 or 1) for all scalar indices. 
- can_fuse = false; - break; + if (consumer && consumer->InputDefs()[0] == node_arg && + (consumer->OpType() == "Gather" || consumer->OpType() == "Slice")) { + condidate_consumers.emplace_back(consumer); } - if (axis < 0) axis += rank; - if (first_edge) { - auto dim = shape->dim(static_cast(axis)); - if (!utils::HasDimValue(dim) || dim.dim_value() != static_cast(consumer_count)) { - can_fuse = false; - break; - } - split_axis = axis; - first_edge = false; - } else if (axis != split_axis) { + } + if (condidate_consumers.size() < 2) continue; + int64_t axis = 0; + if (!GetAxis(graph, *condidate_consumers[0], rank, axis)) continue; + auto dim = shape->dim(static_cast(axis)); + if (!utils::HasDimValue(dim)) continue; + int64_t dim_size = dim.dim_value(); + InlinedVector consumed(static_cast(dim_size), false); + bool can_fuse = true; + InlinedVector> nodes_to_fuse; + InlinedVector starts; + InlinedHashMap> output_info_map; + for (auto consumer : condidate_consumers) { + if (!consumer || consumer->InputDefs()[0] != node_arg) { can_fuse = false; break; } - if (index < 0) index += static_cast(consumer_count); - if (index < 0 || index >= static_cast(consumer_count) || gather_outputs[static_cast(index)]) { + int64_t start = 0, end = 0; + bool need_squeeze = false; + if (IsSupportedGather(graph, *consumer, rank, axis, dim_size, consumed, start, need_squeeze)) { + Node& gather_node = *graph.GetNode(consumer->Index()); + nodes_to_fuse.emplace_back(gather_node); + starts.emplace_back(start); + output_info_map[start] = std::make_tuple(gather_node.MutableOutputDefs()[0], 1, need_squeeze); + } else if (IsSupportedSlice(graph, *consumer, rank, axis, dim_size, consumed, start, end)) { + Node& slice_node = *graph.GetNode(consumer->Index()); + nodes_to_fuse.emplace_back(slice_node); + starts.emplace_back(start); + output_info_map[start] = std::make_tuple(slice_node.MutableOutputDefs()[0], end - start, false); + } else { can_fuse = false; break; } - Node& gather_node = *graph.GetNode(consumer->Index()); - nodes_to_fuse.emplace_back(gather_node); - gather_outputs[static_cast(index)] = gather_node.MutableOutputDefs()[0]; - } - - if (!can_fuse) continue; - - ONNX_NAMESPACE::TypeProto split_output_type; - const ONNX_NAMESPACE::TensorProto_DataType element_type = - static_cast(node_arg->TypeAsProto()->tensor_type().elem_type()); - split_output_type.mutable_tensor_type()->set_elem_type(element_type); - for (int64_t i = 0; i < rank; ++i) { - if (i == split_axis) { - split_output_type.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1LL); - } else { - *(split_output_type.mutable_tensor_type()->mutable_shape()->add_dim()) = shape->dim(static_cast(i)); - } } + if (!can_fuse || std::find(consumed.begin(), consumed.end(), false) != consumed.end()) continue; + std::sort(starts.begin(), starts.end()); InlinedVector split_outputs; - bool add_squeeze_node = indices_n_dims == 0; - if (add_squeeze_node) { - for (size_t i = 0; i < consumer_count; ++i) { - split_outputs.emplace_back( - &graph.GetOrCreateNodeArg(graph.GenerateNodeArgName("split" + std::to_string(i)), &split_output_type)); - } - } - - Node& split_node = - graph.AddNode(graph.GenerateNodeName("Split"), "Split", "Split for Fused Gather nodes", - {graph.GetNodeArg(node_arg->Name())}, add_squeeze_node ? split_outputs : gather_outputs); - split_node.AddAttribute("axis", split_axis); - split_node.SetExecutionProviderType(nodes_to_fuse[0].get().GetExecutionProviderType()); - - // Squeeze-11, Squeee-13, Split-13, Split-18 have different schemas. 
- int onnx_opset_version = -1; - if (graph.DomainToVersionMap().find(kOnnxDomain) != graph.DomainToVersionMap().end()) { - onnx_opset_version = graph.DomainToVersionMap().at(kOnnxDomain); - } - - if (onnx_opset_version < 13) { - if (add_squeeze_node) { - for (size_t i = 0; i < consumer_count; ++i) { - Node& squeeze_node = graph.AddNode(graph.GenerateNodeName("Squeeze" + std::to_string(i)), "Squeeze", - "Squeeze for Fused Gather nodes", {split_outputs[i]}, {gather_outputs[i]}); - squeeze_node.AddAttribute("axes", std::vector{split_axis}); - squeeze_node.SetExecutionProviderType(nodes_to_fuse[0].get().GetExecutionProviderType()); + InlinedVector split_values; + for (int64_t start : starts) { + auto& output_info = output_info_map[start]; + NodeArg* original_output_arg = std::get<0>(output_info); + int64_t split_value = std::get<1>(output_info); + split_values.emplace_back(split_value); + if (std::get<2>(output_info)) { + ONNX_NAMESPACE::TypeProto split_output_type; + const ONNX_NAMESPACE::TensorProto_DataType element_type = + static_cast(node_arg->TypeAsProto()->tensor_type().elem_type()); + split_output_type.mutable_tensor_type()->set_elem_type(element_type); + for (int64_t i = 0; i < rank; ++i) { + if (i == axis) { + split_output_type.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(split_value); + } else { + *(split_output_type.mutable_tensor_type()->mutable_shape()->add_dim()) = shape->dim(static_cast(i)); + } } - } - } else { - if (onnx_opset_version >= 18) { - split_node.AddAttribute("num_outputs", static_cast(consumer_count)); - } - - if (add_squeeze_node) { + NodeArg* split_output_arg = + &graph.GetOrCreateNodeArg(graph.GenerateNodeArgName("split_output"), &split_output_type); ONNX_NAMESPACE::TensorProto axes_initializer_proto; - axes_initializer_proto.set_name(graph.GenerateNodeName("SqueezeAxesInitializer")); + axes_initializer_proto.set_name(graph.GenerateNodeName("squeeze_axes")); axes_initializer_proto.add_dims(static_cast(1)); axes_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); - InlinedVector axes_value{split_axis}; - axes_initializer_proto.set_raw_data(axes_value.data(), axes_value.size() * sizeof(int64_t)); + axes_initializer_proto.add_int64_data(axis); NodeArg* axes_arg = &graph_utils::AddInitializer(graph, axes_initializer_proto); - - for (size_t i = 0; i < consumer_count; ++i) { - Node& squeeze_node = - graph.AddNode(graph.GenerateNodeName("Squeeze" + std::to_string(i)), "Squeeze", - "Squeeze for Fused Gather nodes", {split_outputs[i], axes_arg}, {gather_outputs[i]}); - squeeze_node.SetExecutionProviderType(nodes_to_fuse[0].get().GetExecutionProviderType()); - } + Node& squeeze_node = + graph.AddNode(graph.GenerateNodeName("Squeeze"), "Squeeze", "Squeeze for Fused Gather nodes", + {split_output_arg, axes_arg}, {original_output_arg}); + squeeze_node.SetExecutionProviderType(nodes_to_fuse[0].get().GetExecutionProviderType()); + split_outputs.emplace_back(split_output_arg); + } else { + split_outputs.emplace_back(original_output_arg); } } - for (Node& n : nodes_to_fuse) { - graph_utils::RemoveNodeOutputEdges(graph, n); - graph.RemoveNode(n.Index()); + ONNX_NAMESPACE::TensorProto split_initializer_proto; + split_initializer_proto.set_name(graph.GenerateNodeName("splits")); + split_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + split_initializer_proto.add_dims(static_cast(split_values.size())); + split_initializer_proto.mutable_int64_data()->Add(split_values.begin(), split_values.end()); + NodeArg* 
split_initializer_arg = &graph_utils::AddInitializer(graph, split_initializer_proto); + Node& split_node = graph.AddNode(graph.GenerateNodeName("Split"), "Split", "Split for Fused Gather nodes", + {graph.GetNodeArg(node_arg->Name()), split_initializer_arg}, split_outputs); + split_node.AddAttribute("axis", axis); + split_node.SetExecutionProviderType(nodes_to_fuse[0].get().GetExecutionProviderType()); + + for (Node& node : nodes_to_fuse) { + graph_utils::RemoveNodeOutputEdges(graph, node); + graph.RemoveNode(node.Index()); } modified = true; diff --git a/onnxruntime/core/optimizer/gather_fusion.h b/onnxruntime/core/optimizer/gather_fusion.h index 44c235915b6cc..098278a77dafe 100644 --- a/onnxruntime/core/optimizer/gather_fusion.h +++ b/onnxruntime/core/optimizer/gather_fusion.h @@ -8,19 +8,23 @@ namespace onnxruntime { /** -@Class GatherToSplitFusion +@Class GatherSliceToSplitFusion -Fuse multiple Gather nodes that comsuming one output to one Split node. +Fuse multiple Gather/Slice nodes that comsuming one output to one Split node. */ -class GatherToSplitFusion : public GraphTransformer { +class GatherSliceToSplitFusion : public GraphTransformer { public: - GatherToSplitFusion(const InlinedHashSet& compatible_execution_providers = {}) noexcept - : GraphTransformer("GatherToSplitFusion", compatible_execution_providers) {} + GatherSliceToSplitFusion(const InlinedHashSet& compatible_execution_providers = {}) noexcept + : GraphTransformer("GatherSliceToSplitFusion", compatible_execution_providers) {} Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override; private: - bool IsSupportedGather(const Graph& graph, const Node& node, int64_t& index, int64_t& axis, int64_t& indices_n_dims) const; + bool IsSupportedGather(const Graph& graph, const Node& node, int64_t rank, int64_t target_axis, int64_t dim_size, + InlinedVector& consumed, int64_t& start, bool& need_squeeze) const; + + bool IsSupportedSlice(const Graph& graph, const Node& node, int64_t rank, int64_t target_axis, int64_t dim_size, + InlinedVector& consumed, int64_t& start, int64_t& end) const; }; /** diff --git a/onnxruntime/core/optimizer/gather_slice_fusion.cc b/onnxruntime/core/optimizer/gather_slice_fusion.cc deleted file mode 100644 index 21266d356a020..0000000000000 --- a/onnxruntime/core/optimizer/gather_slice_fusion.cc +++ /dev/null @@ -1,344 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#include "core/optimizer/gather_slice_fusion.h" -#include "core/graph/graph_utils.h" -#include "core/optimizer/initializer.h" -#include "core/optimizer/utils.h" - -namespace onnxruntime { - -bool GatherSliceToSplitFusion::IsSupportedGather(const Graph& graph, const Node& node, int64_t& index, - int64_t& axis, int64_t& indices_n_dims) const { - if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Gather", {1, 11, 13}) || - !graph_utils::IsSupportedProvider(node, GetCompatibleExecutionProviders())) { - return false; - } - - const NodeArg& input_arg = *(node.InputDefs()[1]); - - if (!optimizer_utils::IsScalar(input_arg)) return false; - - const ONNX_NAMESPACE::TensorProto* indices_init = graph_utils::GetConstantInitializer(graph, input_arg.Name()); - - if (!indices_init) return false; - - if (indices_init->data_type() != ONNX_NAMESPACE::TensorProto::INT64) return false; - - // get the index value - Initializer init_const(*indices_init, graph.ModelPath()); - index = *(init_const.data()); - - // get attributes value - axis = 0; - auto& attrs = node.GetAttributes(); - if (attrs.find("axis") != attrs.end()) { - auto& axis_attr = attrs.at("axis"); - if (utils::HasInt(axis_attr)) axis = axis_attr.i(); - } - - indices_n_dims = indices_init->dims_size(); - return true; -} - -bool GatherSliceToSplitFusion::IsSupportedSlice(const Graph& graph, const Node& node, - InlinedVector& starts, - InlinedVector& ends, - InlinedVector& axes, - InlinedVector& steps) const { - // check the version of Slice ops - if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Slice", {1, 10, 11, 13}) || - !graph_utils::IsSupportedProvider(node, GetCompatibleExecutionProviders())) { - return false; - } - - // get the opset version - int onnx_opset_version = -1; - if (graph.DomainToVersionMap().find(kOnnxDomain) != graph.DomainToVersionMap().end()) { - onnx_opset_version = graph.DomainToVersionMap().at(kOnnxDomain); - } - - // If Slice op of opset version 1 - if (onnx_opset_version == 1) { - if (!graph_utils::GetRepeatedNodeAttributeValues(node, "starts", starts) || - !graph_utils::GetRepeatedNodeAttributeValues(node, "ends", ends) || - starts.size() != ends.size()) { - return false; - } - - if (graph_utils::GetRepeatedNodeAttributeValues(node, "axes", axes) && (axes.size() != starts.size())) { - return false; - } - } - - // If Slice op of opset version >= 10 - if (onnx_opset_version >= 10) { - // node inputs include: starts - ends - axes - steps - - // return a pointer to the corresponding NodeArg if input of the node at the index exists - auto get_input_if_exists = [&node](size_t input_index) -> const NodeArg* { - const auto& input_defs = node.InputDefs(); - const NodeArg* input = (input_defs.size() > input_index) ? input_defs[input_index] : nullptr; - return (input == nullptr || !input->Exists()) ? nullptr : input; - }; - - // return a pointer to the initializer if it is constant; otherwise, a nullptr - auto get_initializer_if_constant = - [&graph, get_input_if_exists](size_t input_index) -> const ONNX_NAMESPACE::TensorProto* { - const NodeArg* input = get_input_if_exists(input_index); - return input ? 
graph_utils::GetConstantInitializer(graph, input->Name()) : nullptr; - }; - - // return the initialization data if it is constant - auto get_initializer_data = - [&graph](const ONNX_NAMESPACE::TensorProto* slice_initializer) -> InlinedVector { - Initializer init(*slice_initializer, graph.ModelPath()); - if (slice_initializer->data_type() == ONNX_NAMESPACE::TensorProto::INT32) { - int32_t* init_data = init.data(); - return InlinedVector(init_data, init_data + init.size()); - } - - if (slice_initializer->data_type() == ONNX_NAMESPACE::TensorProto::INT64) { - int64_t* init_data = init.data(); - return InlinedVector(init_data, init_data + init.size()); - } - return {}; - }; - - // starts and ends inputs have to exist, be constants and be of the same size. - const ONNX_NAMESPACE::TensorProto* starts_init = get_initializer_if_constant(1); - const ONNX_NAMESPACE::TensorProto* ends_init = get_initializer_if_constant(2); - const ONNX_NAMESPACE::TensorProto* axes_init = get_initializer_if_constant(3); - const ONNX_NAMESPACE::TensorProto* steps_init = get_initializer_if_constant(4); - - if (!starts_init || !ends_init || !axes_init || !steps_init) { - return false; - } - - starts = get_initializer_data(starts_init); - ends = get_initializer_data(ends_init); - axes = get_initializer_data(axes_init); - steps = get_initializer_data(steps_init); - - if (starts.size() == 0 || ends.size() == 0 || starts.size() != ends.size()) { - return false; - } - - if (axes_init->dims_size() != 1 || static_cast(axes_init->dims().Get(0)) != starts.size()) { - return false; - } - - // if steps exists, it should be constant and all value should be 1 - if (steps.size() != starts.size()) { - return false; - } - - for (int64_t step : steps) { - if (step != 1) { - return false; - } - } - } - - return true; -} - -/* -GatherToSplitFusion is to fuse: - Node - |-> Gather(index=0, axis=axis) - |-> Gather(index=1, axis=axis) - |-> Slice(index=2, axis=axis) -To - Node - |-> Split(index=0) -So that we can use one kernel to finish the job. -*/ - -Status GatherSliceToSplitFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, - const logging::Logger& logger) const { - GraphViewer graph_viewer(graph); - - const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder(); - - InlinedVector output_args; - - // Iterate the topological order and get Reshape ops - for (auto node_index : node_topology_list) { - auto* p_node = graph.GetNode(node_index); - - if (p_node == nullptr) continue; - - Node& node = *p_node; - - ORT_RETURN_IF_ERROR(Recurse(node, modified, graph_level, logger)); - - // Currently only catch after Reshape ops, optimize in the future - if (node.OpType() != "Reshape") continue; - - size_t output_count = node.GetOutputEdgesCount(); - - // We only catch 1 scenario for Multi Query Attention for now. - // |---> Gather - // Reshape |---> Gather - // |---> Slice - // |... 
or (other ops) - - // Get the output into node args - if (output_count < 3) continue; - - output_args.push_back(node.OutputDefs()[0]); - } - - // iterate the children of Reshape node - for (const NodeArg* node_arg : output_args) { - auto shape = node_arg->Shape(); - if (!shape) continue; - - auto consumers = graph.GetConsumerNodes(node_arg->Name()); - size_t consumer_count = consumers.size(); - - // get the tensor rank - int64_t rank = static_cast(shape->dim_size()); - - bool can_fuse = true; - bool first_edge = true; - int64_t split_axis = 0; - int64_t indices_n_dims = -1; - - // Fuse 2 Gathers and 1 slice to Split - // Get those outputs as Split outputs - InlinedVector split_outputs(3); - - InlinedVector> nodes_to_fuse; - size_t gather_node_count = 2, slice_node_count = 0; - - // find the nodes to be merged - for (auto consumer : consumers) { - int64_t index, axis, dims; - InlinedVector starts, ends, axes, steps; - - bool IsSupportedGatherOps = IsSupportedGather(graph, *consumer, index, axis, dims); - bool IsSupportedSliceOps = IsSupportedSlice(graph, *consumer, starts, ends, axes, steps); - - if ((!consumer || consumer->InputDefs()[0] != node_arg) || - (!IsSupportedGatherOps && !IsSupportedSliceOps)) { - break; - } - - if (IsSupportedGatherOps) { - if (indices_n_dims == -1) { - indices_n_dims = dims; - } else if (indices_n_dims != dims) { - // Not the same number of dimensions (0 or 1) for all scalar indices. - can_fuse = false; - break; - } - - if (axis < 0) axis += rank; - - if (first_edge) { - auto dim = shape->dim(static_cast(axis)); - // dim.dim_value() = 73 - if (!utils::HasDimValue(dim)) { - can_fuse = false; - break; - } - split_axis = axis; - first_edge = false; - } else if (axis != split_axis) { - can_fuse = false; - break; - } - - if (index < 0) index += static_cast(consumer_count); - if (index < 0 || index >= static_cast(consumer_count)) { - can_fuse = false; - break; - } - - Node& gather_node = *graph.GetNode(consumer->Index()); - nodes_to_fuse.push_back(gather_node); - NodeArg* gather_output_args = gather_node.MutableOutputDefs()[0]; - split_outputs[gather_node_count--] = gather_output_args; - } - - // check the Slice Ops - if (IsSupportedSliceOps) { - if (axes[0] != axis && !first_edge) { - can_fuse = false; - break; - } - - Node& slice_node = *graph.GetNode(consumer->Index()); - NodeArg* slice_output_args = slice_node.MutableOutputDefs()[0]; - nodes_to_fuse.push_back(slice_node); - split_outputs[slice_node_count++] = slice_output_args; - } - } - - // condition check - if (!can_fuse || gather_node_count != 0 || slice_node_count != 1) continue; - - // generate the split node and merge the kernel - ONNX_NAMESPACE::TypeProto split_output_type; - const ONNX_NAMESPACE::TensorProto_DataType element_type = static_cast( - node_arg->TypeAsProto()->tensor_type().elem_type()); - - split_output_type.mutable_tensor_type()->set_elem_type(element_type); - - for (int64_t i = 0; i < rank; i++) { - if (i == split_axis) - split_output_type.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1LL); - else - *(split_output_type.mutable_tensor_type()->mutable_shape()->add_dim()) = shape->dim(static_cast(i)); - } - - InlinedVector split_output_types; - - for (size_t i = 0; i < consumer_count; ++i) { - split_output_types.push_back( - &graph.GetOrCreateNodeArg( - graph.GenerateNodeArgName("fused_split_" + std::to_string(i)), &split_output_type)); - } - - // Generate the Split Node - ONNX_NAMESPACE::TensorProto split_initializer_proto; - 
split_initializer_proto.set_name(graph.GenerateNodeName("fused_Split")); - split_initializer_proto.add_dims(static_cast(3)); - split_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); - - auto dim_value = shape->dim(static_cast(split_axis)).dim_value(); - // Optimize 2 Gather Nodes, so Slice_dim = dim_value - 2 - int64_t slice_dim = static_cast(dim_value - 2); - InlinedVector split_value{{slice_dim, 1, 1}}; - split_initializer_proto.set_raw_data(split_value.data(), split_value.size() * sizeof(int64_t)); - NodeArg* split_arg = &graph_utils::AddInitializer(graph, split_initializer_proto); - - Node& split_node = - graph.AddNode(graph.GenerateNodeName("Split"), "Split", "Split for fused Gather-Slice fusion", - {graph.GetNodeArg(node_arg->Name()), split_arg}, split_outputs); - - split_node.AddAttribute("axis", split_axis); - - split_node.SetExecutionProviderType(nodes_to_fuse[0].get().GetExecutionProviderType()); - - int onnx_opset_version = -1; - if (graph.DomainToVersionMap().find(kOnnxDomain) != graph.DomainToVersionMap().end()) { - onnx_opset_version = graph.DomainToVersionMap().at(kOnnxDomain); - } - - if (onnx_opset_version >= 18) { - split_node.AddAttribute("num_outputs", static_cast(consumer_count)); - } - - for (Node& node_to_fuse : nodes_to_fuse) { - graph_utils::RemoveNodeOutputEdges(graph, node_to_fuse); - graph.RemoveNode(node_to_fuse.Index()); - } - modified = true; - } - - return Status::OK(); -} -} // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/gather_slice_fusion.h b/onnxruntime/core/optimizer/gather_slice_fusion.h deleted file mode 100644 index 1c5c307efed7f..0000000000000 --- a/onnxruntime/core/optimizer/gather_slice_fusion.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include "core/optimizer/graph_transformer.h" - -namespace onnxruntime { - -/** -@class GatherSliceToSplitFusion -Fuse (2 Gather nodes + 1 Slice) to 1 split node. 
-*/ - -class GatherSliceToSplitFusion : public GraphTransformer { - private: - bool IsSupportedGather(const Graph& graph, const Node& node, int64_t& index, int64_t& axis, - int64_t& indices_n_dims) const; - - bool IsSupportedSlice(const Graph& graph, const Node& node, - InlinedVector& starts, - InlinedVector& ends, - InlinedVector& axes, - InlinedVector& steps) const; - - public: - GatherSliceToSplitFusion(const InlinedHashSet& compatible_execution_providers = {}) noexcept - : GraphTransformer("GatherSliceToSplitFusion", compatible_execution_providers) {} - - Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override; -}; -} // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index 4e939fe3c7b6b..8376b87aee6b2 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -37,7 +37,6 @@ #include "core/optimizer/fast_gelu_fusion.h" #include "core/optimizer/free_dim_override_transformer.h" #include "core/optimizer/gather_fusion.h" -#include "core/optimizer/gather_slice_fusion.h" #include "core/optimizer/gelu_approximation.h" #include "core/optimizer/gelu_fusion.h" #include "core/optimizer/gemm_activation_fusion.h" @@ -307,9 +306,8 @@ InlinedVector> GenerateTransformers( transformers.emplace_back(std::make_unique(cpu_cuda_rocm_eps)); transformers.emplace_back(std::make_unique(cpu_cuda_dml_rocm_eps)); transformers.emplace_back(std::make_unique(cpu_cuda_dml_rocm_eps)); - transformers.emplace_back(std::make_unique(cpu_cuda_rocm_eps)); - transformers.emplace_back(std::make_unique(cpu_cuda_rocm_eps)); transformers.emplace_back(std::make_unique(cpu_cuda_rocm_eps)); + transformers.emplace_back(std::make_unique(cpu_cuda_rocm_eps)); transformers.emplace_back(std::make_unique(cpu_cuda_dml_rocm_eps)); transformers.emplace_back(std::make_unique(cpu_cuda_dml_rocm_eps)); diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index e1fcf835c6043..16f38bac62713 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -42,7 +42,6 @@ #include "core/optimizer/expand_elimination.h" #include "core/optimizer/fast_gelu_fusion.h" #include "core/optimizer/gather_fusion.h" -#include "core/optimizer/gather_slice_fusion.h" #include "core/optimizer/gelu_approximation.h" #include "core/optimizer/gelu_fusion.h" #include "core/optimizer/gemm_activation_fusion.h" @@ -7059,13 +7058,13 @@ TEST_F(GraphTransformationTests, ConstantSharing_ShouldNotShareForGraphOutput) { } } -TEST_F(GraphTransformationTests, GatherToSplitFusion) { +TEST_F(GraphTransformationTests, GatherSliceToSplitFusion_AllGather) { auto build_test_case = [&](ModelTestBuilder& builder) { auto* data_arg = builder.MakeInput({{54}}); auto* shape_arg = builder.MakeInput({{4}}); auto* reshape_out = builder.MakeIntermediate({{2, 3, 3, 3}}); auto* gather_index_1 = builder.MakeInitializer({}, {static_cast(0)}); - auto* gather_index_2 = builder.MakeInitializer({}, {static_cast(1)}); + auto* gather_index_2 = builder.MakeInitializer({1}, {static_cast(1)}); auto* gather_index_3 = builder.MakeInitializer({}, {static_cast(2)}); auto* gather_out_1 = builder.MakeIntermediate(); auto* gather_out_2 = builder.MakeIntermediate(); @@ -7082,7 +7081,8 @@ TEST_F(GraphTransformationTests, GatherToSplitFusion) { builder.AddNode("Gather", 
{reshape_out, gather_index_3}, {gather_out_3}) .AddAttribute("axis", static_cast(2)); builder.AddNode("Transpose", {gather_out_1}, {transpose_out_1}).AddAttribute("perm", std::vector{0, 2, 1}); - builder.AddNode("Transpose", {gather_out_2}, {transpose_out_2}).AddAttribute("perm", std::vector{0, 2, 1}); + builder.AddNode("Transpose", {gather_out_2}, {transpose_out_2}) + .AddAttribute("perm", std::vector{0, 2, 1, 3}); builder.AddNode("Transpose", {gather_out_3}, {transpose_out_3}).AddAttribute("perm", std::vector{0, 2, 1}); }; @@ -7091,27 +7091,16 @@ TEST_F(GraphTransformationTests, GatherToSplitFusion) { return Status::OK(); }; - // OpSet-12 + // OpSet-12, not support { auto post_graph_checker = [&](Graph& graph) { - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 0); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Split"] == 1); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Squeeze"] == 3); - for (auto& node : graph.Nodes()) { - if (node.OpType() == "Split") { - auto& attrs = node.GetAttributes(); - TEST_RETURN_IF_NOT(attrs.find("axis") != attrs.end()); - TEST_RETURN_IF_NOT(2 == static_cast(attrs.at("axis").i())); - } else if (node.OpType() == "Squeeze") { - auto& attrs = node.GetAttributes(); - TEST_RETURN_IF_NOT(attrs.find("axes") != attrs.end()); - TEST_RETURN_IF_NOT(2 == static_cast(attrs.at("axes").ints().at(0))); - } - } + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 3); + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Split"] == 0); + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Squeeze"] == 0); return Status::OK(); }; - std::unique_ptr transformer = std::make_unique(); + std::unique_ptr transformer = std::make_unique(); ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 12, *logger_, std::move(transformer), TransformerLevel::Level1, 1, pre_graph_checker, post_graph_checker)); } @@ -7121,7 +7110,7 @@ TEST_F(GraphTransformationTests, GatherToSplitFusion) { auto post_graph_checker = [&](Graph& graph) { TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 0); TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Split"] == 1); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Squeeze"] == 3); + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Squeeze"] == 2); for (auto& node : graph.Nodes()) { if (node.OpType() == "Split") { auto& attrs = node.GetAttributes(); @@ -7140,249 +7129,140 @@ TEST_F(GraphTransformationTests, GatherToSplitFusion) { return Status::OK(); }; - std::unique_ptr transformer = std::make_unique(); + std::unique_ptr transformer = std::make_unique(); ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 14, *logger_, std::move(transformer), TransformerLevel::Level1, 1, pre_graph_checker, post_graph_checker)); } - - // OpSet-18 - { - auto post_graph_checker = [&](Graph& graph) { - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 0); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Split"] == 1); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Squeeze"] == 3); - for (auto& node : graph.Nodes()) { - if (node.OpType() == "Split") { - auto& attrs = node.GetAttributes(); - TEST_RETURN_IF_NOT(attrs.find("axis") != attrs.end()); - TEST_RETURN_IF_NOT(2 == static_cast(attrs.at("axis").i())); - } else if (node.OpType() == "Squeeze") { - const NodeArg& input_arg = *(node.InputDefs()[1]); - const ONNX_NAMESPACE::TensorProto* tensor_proto = - graph_utils::GetConstantInitializer(graph, input_arg.Name()); - TEST_RETURN_IF_NOT(tensor_proto != nullptr); - Initializer init_const{*tensor_proto, graph.ModelPath()}; - TEST_RETURN_IF_NOT(tensor_proto->data_type() == 
ONNX_NAMESPACE::TensorProto_DataType_INT64); - TEST_RETURN_IF_NOT(2 == static_cast(*(init_const.data()))); - } - } - return Status::OK(); - }; - - std::unique_ptr transformer = std::make_unique(); - ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 18, *logger_, std::move(transformer), - TransformerLevel::Level1, 1, pre_graph_checker, post_graph_checker)); - } } -TEST_F(GraphTransformationTests, GatherToSplitFusion_NoSqueeze) { +TEST_F(GraphTransformationTests, GatherSliceToSplitFusion_AllSlice_GraphInput) { auto build_test_case = [&](ModelTestBuilder& builder) { - auto* data_arg = builder.MakeInput({{54}}); - auto* shape_arg = builder.MakeInput({{4}}); - auto* reshape_out = builder.MakeIntermediate({{2, 3, 3, 3}}); - auto* gather_index_1 = builder.MakeInitializer({1}, {static_cast(0)}); - auto* gather_index_2 = builder.MakeInitializer({1}, {static_cast(1)}); - auto* gather_index_3 = builder.MakeInitializer({1}, {static_cast(2)}); - auto* gather_out_1 = builder.MakeIntermediate(); - auto* gather_out_2 = builder.MakeIntermediate(); - auto* gather_out_3 = builder.MakeIntermediate(); + auto* data_arg = builder.MakeInput({{2, 3, 8, 3}}); + auto* starts_1 = builder.MakeInitializer({1}, {0}); + auto* ends_1 = builder.MakeInitializer({1}, {2}); + auto* axes_1 = builder.MakeInitializer({1}, {2}); + auto* steps_1 = builder.MakeInitializer({1}, {1}); + auto* starts_2 = builder.MakeInitializer({1}, {2}); + auto* ends_2 = builder.MakeInitializer({1}, {-2}); + auto* axes_2 = builder.MakeInitializer({1}, {-2}); + auto* steps_2 = builder.MakeInitializer({1}, {1}); + auto* starts_3 = builder.MakeInitializer({1}, {-2}); + auto* ends_3 = builder.MakeInitializer({1}, {16}); + auto* axes_3 = builder.MakeInitializer({1}, {2}); + auto* slice_out_1 = builder.MakeIntermediate(); + auto* slice_out_2 = builder.MakeIntermediate(); + auto* slice_out_3 = builder.MakeIntermediate(); auto* transpose_out_1 = builder.MakeOutput(); auto* transpose_out_2 = builder.MakeOutput(); auto* transpose_out_3 = builder.MakeOutput(); - builder.AddNode("Reshape", {data_arg, shape_arg}, {reshape_out}); - builder.AddNode("Gather", {reshape_out, gather_index_1}, {gather_out_1}) - .AddAttribute("axis", static_cast(2)); - builder.AddNode("Gather", {reshape_out, gather_index_2}, {gather_out_2}) - .AddAttribute("axis", static_cast(-2)); - builder.AddNode("Gather", {reshape_out, gather_index_3}, {gather_out_3}) - .AddAttribute("axis", static_cast(2)); - builder.AddNode("Transpose", {gather_out_1}, {transpose_out_1}).AddAttribute("perm", std::vector{0, 2, 1}); - builder.AddNode("Transpose", {gather_out_2}, {transpose_out_2}).AddAttribute("perm", std::vector{0, 2, 1}); - builder.AddNode("Transpose", {gather_out_3}, {transpose_out_3}).AddAttribute("perm", std::vector{0, 2, 1}); + builder.AddNode("Slice", {data_arg, starts_1, ends_1, axes_1, steps_1}, {slice_out_1}); + builder.AddNode("Slice", {data_arg, starts_2, ends_2, axes_2, steps_2}, {slice_out_2}); + builder.AddNode("Slice", {data_arg, starts_3, ends_3, axes_3}, {slice_out_3}); + builder.AddNode("Transpose", {slice_out_1}, {transpose_out_1}) + .AddAttribute("perm", std::vector{0, 2, 1, 3}); + builder.AddNode("Transpose", {slice_out_2}, {transpose_out_2}) + .AddAttribute("perm", std::vector{0, 2, 1, 3}); + builder.AddNode("Transpose", {slice_out_3}, {transpose_out_3}) + .AddAttribute("perm", std::vector{0, 2, 1, 3}); }; auto pre_graph_checker = [&](Graph& graph) { - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 3); + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Slice"] == 3); 
return Status::OK(); }; - // OpSet-12 - { - auto post_graph_checker = [&](Graph& graph) { - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 0); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Split"] == 1); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Squeeze"] == 0); - for (auto& node : graph.Nodes()) { - if (node.OpType() == "Split") { - auto& attrs = node.GetAttributes(); - TEST_RETURN_IF_NOT(attrs.find("axis") != attrs.end()); - TEST_RETURN_IF_NOT(2 == static_cast(attrs.at("axis").i())); - } - } - return Status::OK(); - }; - - std::unique_ptr transformer = std::make_unique(); - ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 12, *logger_, std::move(transformer), - TransformerLevel::Level1, 1, pre_graph_checker, post_graph_checker)); - } - - // OpSet-14 - { - auto post_graph_checker = [&](Graph& graph) { - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 0); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Split"] == 1); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Squeeze"] == 0); - for (auto& node : graph.Nodes()) { - if (node.OpType() == "Split") { - auto& attrs = node.GetAttributes(); - TEST_RETURN_IF_NOT(attrs.find("axis") != attrs.end()); - TEST_RETURN_IF_NOT(2 == static_cast(attrs.at("axis").i())); - } - } - return Status::OK(); - }; - - std::unique_ptr transformer = std::make_unique(); - ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 14, *logger_, std::move(transformer), - TransformerLevel::Level1, 1, pre_graph_checker, post_graph_checker)); - } - - // OpSet-18 - { - auto post_graph_checker = [&](Graph& graph) { - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 0); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Split"] == 1); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Squeeze"] == 0); - for (auto& node : graph.Nodes()) { - if (node.OpType() == "Split") { - auto& attrs = node.GetAttributes(); - TEST_RETURN_IF_NOT(attrs.find("axis") != attrs.end()); - TEST_RETURN_IF_NOT(2 == static_cast(attrs.at("axis").i())); - } + auto post_graph_checker = [&](Graph& graph) { + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 0); + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Split"] == 1); + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Squeeze"] == 0); + for (auto& node : graph.Nodes()) { + if (node.OpType() == "Split") { + auto& attrs = node.GetAttributes(); + TEST_RETURN_IF_NOT(attrs.find("axis") != attrs.end()); + TEST_RETURN_IF_NOT(2 == static_cast(attrs.at("axis").i())); } - return Status::OK(); - }; + } + return Status::OK(); + }; - std::unique_ptr transformer = std::make_unique(); - ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 18, *logger_, std::move(transformer), - TransformerLevel::Level1, 1, pre_graph_checker, post_graph_checker)); - } + std::unique_ptr transformer = std::make_unique(); + ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 18, *logger_, std::move(transformer), TransformerLevel::Level1, + 1, pre_graph_checker, post_graph_checker)); } -TEST_F(GraphTransformationTests, GatherToSplitFusion_Consume_Input) { +TEST_F(GraphTransformationTests, GatherSliceToSplitFusion_Combined) { auto build_test_case = [&](ModelTestBuilder& builder) { - auto* data_arg = builder.MakeInput({{2, 3, 3, 3}}); - auto* gather_index_1 = builder.MakeInitializer({}, {static_cast(0)}); - auto* gather_index_2 = builder.MakeInitializer({}, {static_cast(1)}); - auto* gather_index_3 = builder.MakeInitializer({}, {static_cast(2)}); + auto* data_arg = builder.MakeInput({{144}}); + auto* shape_arg = builder.MakeInput({{4}}); + auto* reshape_out = 
builder.MakeIntermediate({{2, 8, 3, 3}}); + auto* gather_index_1 = builder.MakeInitializer({}, {static_cast(5)}); + auto* starts_2 = builder.MakeInitializer({1}, {6}); + auto* ends_2 = builder.MakeInitializer({1}, {8}); + auto* axes_2 = builder.MakeInitializer({1}, {-3}); + auto* steps_2 = builder.MakeInitializer({1}, {1}); + auto* gather_index_3 = builder.MakeInitializer({1}, {static_cast(4)}); + auto* starts_4 = builder.MakeInitializer({1}, {-16}); + auto* ends_4 = builder.MakeInitializer({1}, {4}); + auto* axes_4 = builder.MakeInitializer({1}, {1}); auto* gather_out_1 = builder.MakeIntermediate(); - auto* gather_out_2 = builder.MakeIntermediate(); + auto* slice_out_2 = builder.MakeIntermediate(); auto* gather_out_3 = builder.MakeIntermediate(); + auto* slice_out_4 = builder.MakeIntermediate(); auto* transpose_out_1 = builder.MakeOutput(); auto* transpose_out_2 = builder.MakeOutput(); auto* transpose_out_3 = builder.MakeOutput(); + auto* transpose_out_4 = builder.MakeOutput(); - builder.AddNode("Gather", {data_arg, gather_index_1}, {gather_out_1}).AddAttribute("axis", static_cast(2)); - builder.AddNode("Gather", {data_arg, gather_index_2}, {gather_out_2}) - .AddAttribute("axis", static_cast(-2)); - builder.AddNode("Gather", {data_arg, gather_index_3}, {gather_out_3}).AddAttribute("axis", static_cast(2)); + builder.AddNode("Reshape", {data_arg, shape_arg}, {reshape_out}); + builder.AddNode("Gather", {reshape_out, gather_index_1}, {gather_out_1}) + .AddAttribute("axis", static_cast(1)); + builder.AddNode("Slice", {reshape_out, starts_2, ends_2, axes_2, steps_2}, {slice_out_2}); + builder.AddNode("Gather", {reshape_out, gather_index_3}, {gather_out_3}) + .AddAttribute("axis", static_cast(-3)); + builder.AddNode("Slice", {reshape_out, starts_4, ends_4, axes_4}, {slice_out_4}); builder.AddNode("Transpose", {gather_out_1}, {transpose_out_1}).AddAttribute("perm", std::vector{0, 2, 1}); - builder.AddNode("Transpose", {gather_out_2}, {transpose_out_2}).AddAttribute("perm", std::vector{0, 2, 1}); - builder.AddNode("Transpose", {gather_out_3}, {transpose_out_3}).AddAttribute("perm", std::vector{0, 2, 1}); + builder.AddNode("Transpose", {slice_out_2}, {transpose_out_2}) + .AddAttribute("perm", std::vector{0, 2, 1, 3}); + builder.AddNode("Transpose", {gather_out_3}, {transpose_out_3}) + .AddAttribute("perm", std::vector{0, 2, 1, 3}); + builder.AddNode("Transpose", {slice_out_4}, {transpose_out_4}) + .AddAttribute("perm", std::vector{0, 2, 1, 3}); }; auto pre_graph_checker = [&](Graph& graph) { - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 3); + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 2); + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Slice"] == 2); return Status::OK(); }; - // OpSet-12 - { - auto post_graph_checker = [&](Graph& graph) { - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 0); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Split"] == 1); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Squeeze"] == 3); - for (auto& node : graph.Nodes()) { - if (node.OpType() == "Split") { - auto& attrs = node.GetAttributes(); - TEST_RETURN_IF_NOT(attrs.find("axis") != attrs.end()); - TEST_RETURN_IF_NOT(2 == static_cast(attrs.at("axis").i())); - } else if (node.OpType() == "Squeeze") { - auto& attrs = node.GetAttributes(); - TEST_RETURN_IF_NOT(attrs.find("axes") != attrs.end()); - TEST_RETURN_IF_NOT(2 == static_cast(attrs.at("axes").ints().at(0))); - } - } - return Status::OK(); - }; - - std::unique_ptr transformer = std::make_unique(); - 
ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 12, *logger_, std::move(transformer), - TransformerLevel::Level1, 1, pre_graph_checker, post_graph_checker)); - } - - // OpSet-14 - { - auto post_graph_checker = [&](Graph& graph) { - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 0); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Split"] == 1); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Squeeze"] == 3); - for (auto& node : graph.Nodes()) { - if (node.OpType() == "Split") { - auto& attrs = node.GetAttributes(); - TEST_RETURN_IF_NOT(attrs.find("axis") != attrs.end()); - TEST_RETURN_IF_NOT(2 == static_cast(attrs.at("axis").i())); - } else if (node.OpType() == "Squeeze") { - const NodeArg& input_arg = *(node.InputDefs()[1]); - const ONNX_NAMESPACE::TensorProto* tensor_proto = - graph_utils::GetConstantInitializer(graph, input_arg.Name()); - TEST_RETURN_IF_NOT(tensor_proto != nullptr); - Initializer init_const{*tensor_proto, graph.ModelPath()}; - TEST_RETURN_IF_NOT(tensor_proto->data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT64); - TEST_RETURN_IF_NOT(2 == static_cast(*(init_const.data()))); - } - } - return Status::OK(); - }; - - std::unique_ptr transformer = std::make_unique(); - ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 14, *logger_, std::move(transformer), - TransformerLevel::Level1, 1, pre_graph_checker, post_graph_checker)); - } - - // OpSet-18 - { - auto post_graph_checker = [&](Graph& graph) { - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 0); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Split"] == 1); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Squeeze"] == 3); - for (auto& node : graph.Nodes()) { - if (node.OpType() == "Split") { - auto& attrs = node.GetAttributes(); - TEST_RETURN_IF_NOT(attrs.find("axis") != attrs.end()); - TEST_RETURN_IF_NOT(2 == static_cast(attrs.at("axis").i())); - } else if (node.OpType() == "Squeeze") { - const NodeArg& input_arg = *(node.InputDefs()[1]); - const ONNX_NAMESPACE::TensorProto* tensor_proto = - graph_utils::GetConstantInitializer(graph, input_arg.Name()); - TEST_RETURN_IF_NOT(tensor_proto != nullptr); - Initializer init_const{*tensor_proto, graph.ModelPath()}; - TEST_RETURN_IF_NOT(tensor_proto->data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT64); - TEST_RETURN_IF_NOT(2 == static_cast(*(init_const.data()))); - } + auto post_graph_checker = [&](Graph& graph) { + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 0); + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Split"] == 1); + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Squeeze"] == 1); + for (auto& node : graph.Nodes()) { + if (node.OpType() == "Split") { + auto& attrs = node.GetAttributes(); + TEST_RETURN_IF_NOT(attrs.find("axis") != attrs.end()); + TEST_RETURN_IF_NOT(1 == static_cast(attrs.at("axis").i())); + } else if (node.OpType() == "Squeeze") { + const NodeArg& input_arg = *(node.InputDefs()[1]); + const ONNX_NAMESPACE::TensorProto* tensor_proto = graph_utils::GetConstantInitializer(graph, input_arg.Name()); + TEST_RETURN_IF_NOT(tensor_proto != nullptr); + Initializer init_const{*tensor_proto, graph.ModelPath()}; + TEST_RETURN_IF_NOT(tensor_proto->data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT64); + TEST_RETURN_IF_NOT(1 == static_cast(*(init_const.data()))); } - return Status::OK(); - }; + } + return Status::OK(); + }; - std::unique_ptr transformer = std::make_unique(); - ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 18, *logger_, std::move(transformer), - TransformerLevel::Level1, 1, pre_graph_checker, 
post_graph_checker)); - } + std::unique_ptr transformer = std::make_unique(); + ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 14, *logger_, std::move(transformer), TransformerLevel::Level1, + 1, pre_graph_checker, post_graph_checker)); } -TEST_F(GraphTransformationTests, GatherToSplitFusion_Consume_Initializer) { +TEST_F(GraphTransformationTests, GatherSliceToSplitFusion_Consume_Initializer) { auto build_test_case = [&](ModelTestBuilder& builder) { auto* data_arg = builder.MakeInitializer({2, 3, 3, 3}, std::vector(54)); auto* gather_index_1 = builder.MakeInitializer({}, {static_cast(0)}); @@ -7430,31 +7310,31 @@ TEST_F(GraphTransformationTests, GatherToSplitFusion_Consume_Initializer) { return Status::OK(); }; - std::unique_ptr transformer = std::make_unique(); + std::unique_ptr transformer = std::make_unique(); ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 14, *logger_, std::move(transformer), TransformerLevel::Level1, 1, pre_graph_checker, post_graph_checker)); } -TEST_F(GraphTransformationTests, GatherToSplitFusion_Invalid) { +TEST_F(GraphTransformationTests, GatherSliceToSplitFusion_Invalid) { auto pre_graph_checker = [&](Graph& graph) { - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 3); + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] > 0 || CountOpsInGraph(graph)["Slice"] > 0); return Status::OK(); }; auto post_graph_checker = [&](Graph& graph) { - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 3); + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] > 0 || CountOpsInGraph(graph)["Slice"] > 0); TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Split"] == 0); TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Squeeze"] == 0); return Status::OK(); }; - // Invalid shape. + // Not cover all elements of specific dimension. 
{ auto build_test_case = [&](ModelTestBuilder& builder) { auto* data_arg = builder.MakeInput({{72}}); - auto* shape_arg = builder.MakeInput({{1}}); + auto* shape_arg = builder.MakeInput({{4}}); auto* reshape_out = builder.MakeIntermediate({{2, 3, 4, 3}}); auto* gather_index_1 = builder.MakeInitializer({}, {static_cast(0)}); - auto* gather_index_2 = builder.MakeInitializer({}, {static_cast(1)}); + auto* gather_index_2 = builder.MakeInitializer({1}, {static_cast(1)}); auto* gather_index_3 = builder.MakeInitializer({}, {static_cast(2)}); auto* gather_out_1 = builder.MakeIntermediate(); auto* gather_out_2 = builder.MakeIntermediate(); @@ -7467,63 +7347,65 @@ TEST_F(GraphTransformationTests, GatherToSplitFusion_Invalid) { builder.AddNode("Gather", {reshape_out, gather_index_1}, {gather_out_1}) .AddAttribute("axis", static_cast(2)); builder.AddNode("Gather", {reshape_out, gather_index_2}, {gather_out_2}) - .AddAttribute("axis", static_cast(2)); + .AddAttribute("axis", static_cast(-2)); builder.AddNode("Gather", {reshape_out, gather_index_3}, {gather_out_3}) .AddAttribute("axis", static_cast(2)); builder.AddNode("Transpose", {gather_out_1}, {transpose_out_1}) .AddAttribute("perm", std::vector{0, 2, 1}); builder.AddNode("Transpose", {gather_out_2}, {transpose_out_2}) - .AddAttribute("perm", std::vector{0, 2, 1}); + .AddAttribute("perm", std::vector{0, 2, 1, 3}); builder.AddNode("Transpose", {gather_out_3}, {transpose_out_3}) .AddAttribute("perm", std::vector{0, 2, 1}); }; - std::unique_ptr transformer = std::make_unique(); - ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 12, *logger_, std::move(transformer), + std::unique_ptr transformer = std::make_unique(); + ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 14, *logger_, std::move(transformer), TransformerLevel::Level1, 1, pre_graph_checker, post_graph_checker)); } - // Invalid Gather indices. + // Has overlap. 
{ auto build_test_case = [&](ModelTestBuilder& builder) { - auto* data_arg = builder.MakeInput({{54}}); - auto* shape_arg = builder.MakeInput({{1}}); - auto* reshape_out = builder.MakeIntermediate({{2, 3, 3, 3}}); - auto* gather_index_1 = builder.MakeInitializer({}, {static_cast(0)}); - auto* gather_index_2 = builder.MakeInitializer({}, {static_cast(1)}); - auto* gather_index_3 = builder.MakeInitializer({}, {static_cast(1)}); - auto* gather_out_1 = builder.MakeIntermediate(); - auto* gather_out_2 = builder.MakeIntermediate(); - auto* gather_out_3 = builder.MakeIntermediate(); + auto* data_arg = builder.MakeInput({{2, 3, 8, 3}}); + auto* starts_1 = builder.MakeInitializer({1}, {0}); + auto* ends_1 = builder.MakeInitializer({1}, {3}); + auto* axes_1 = builder.MakeInitializer({1}, {2}); + auto* steps_1 = builder.MakeInitializer({1}, {1}); + auto* starts_2 = builder.MakeInitializer({1}, {2}); + auto* ends_2 = builder.MakeInitializer({1}, {-2}); + auto* axes_2 = builder.MakeInitializer({1}, {-2}); + auto* steps_2 = builder.MakeInitializer({1}, {1}); + auto* starts_3 = builder.MakeInitializer({1}, {-2}); + auto* ends_3 = builder.MakeInitializer({1}, {16}); + auto* axes_3 = builder.MakeInitializer({1}, {2}); + auto* slice_out_1 = builder.MakeIntermediate(); + auto* slice_out_2 = builder.MakeIntermediate(); + auto* slice_out_3 = builder.MakeIntermediate(); auto* transpose_out_1 = builder.MakeOutput(); auto* transpose_out_2 = builder.MakeOutput(); auto* transpose_out_3 = builder.MakeOutput(); - builder.AddNode("Reshape", {data_arg, shape_arg}, {reshape_out}); - builder.AddNode("Gather", {reshape_out, gather_index_1}, {gather_out_1}) - .AddAttribute("axis", static_cast(2)); - builder.AddNode("Gather", {reshape_out, gather_index_2}, {gather_out_2}) - .AddAttribute("axis", static_cast(2)); - builder.AddNode("Gather", {reshape_out, gather_index_3}, {gather_out_3}) - .AddAttribute("axis", static_cast(2)); - builder.AddNode("Transpose", {gather_out_1}, {transpose_out_1}) - .AddAttribute("perm", std::vector{0, 2, 1}); - builder.AddNode("Transpose", {gather_out_2}, {transpose_out_2}) - .AddAttribute("perm", std::vector{0, 2, 1}); - builder.AddNode("Transpose", {gather_out_3}, {transpose_out_3}) - .AddAttribute("perm", std::vector{0, 2, 1}); + builder.AddNode("Slice", {data_arg, starts_1, ends_1, axes_1, steps_1}, {slice_out_1}); + builder.AddNode("Slice", {data_arg, starts_2, ends_2, axes_2, steps_2}, {slice_out_2}); + builder.AddNode("Slice", {data_arg, starts_3, ends_3, axes_3}, {slice_out_3}); + builder.AddNode("Transpose", {slice_out_1}, {transpose_out_1}) + .AddAttribute("perm", std::vector{0, 2, 1, 3}); + builder.AddNode("Transpose", {slice_out_2}, {transpose_out_2}) + .AddAttribute("perm", std::vector{0, 2, 1, 3}); + builder.AddNode("Transpose", {slice_out_3}, {transpose_out_3}) + .AddAttribute("perm", std::vector{0, 2, 1, 3}); }; - std::unique_ptr transformer = std::make_unique(); - ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 14, *logger_, std::move(transformer), + std::unique_ptr transformer = std::make_unique(); + ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 18, *logger_, std::move(transformer), TransformerLevel::Level1, 1, pre_graph_checker, post_graph_checker)); } - // Invalid Gather axis. + // Invalid axis. 
{ auto build_test_case = [&](ModelTestBuilder& builder) { auto* data_arg = builder.MakeInput({{54}}); - auto* shape_arg = builder.MakeInput({{1}}); + auto* shape_arg = builder.MakeInput({{4}}); auto* reshape_out = builder.MakeIntermediate({{2, 3, 3, 3}}); auto* gather_index_1 = builder.MakeInitializer({}, {static_cast(0)}); auto* gather_index_2 = builder.MakeInitializer({}, {static_cast(1)}); @@ -7550,7 +7432,7 @@ TEST_F(GraphTransformationTests, GatherToSplitFusion_Invalid) { .AddAttribute("perm", std::vector{0, 2, 1}); }; - std::unique_ptr transformer = std::make_unique(); + std::unique_ptr transformer = std::make_unique(); ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 14, *logger_, std::move(transformer), TransformerLevel::Level1, 1, pre_graph_checker, post_graph_checker)); } @@ -7643,143 +7525,5 @@ TEST_F(GraphTransformationTests, GatherToSliceFusion) { } } -TEST_F(GraphTransformationTests, GatherSliceToSplitFusion) { - { - auto build_test_case = [&](ModelTestBuilder& builder) { - auto* data_arg = builder.MakeInput({{54}}); - auto* reshape_arg = builder.MakeInput({{4}}); - auto* reshape_out = builder.MakeIntermediate({{2, 512, 73, 64}}); - builder.AddNode("Reshape", {data_arg, reshape_arg}, {reshape_out}); - - // Create Gather-1 Ops - auto* gather_index_1 = builder.MakeInitializer({}, {static_cast(-2)}); - auto* gather_out_1 = builder.MakeIntermediate({{2, 512, 1, 64}}); - builder.AddNode("Gather", {reshape_out, gather_index_1}, {gather_out_1}) - .AddAttribute("axis", static_cast(2)); - - // Create Transpose 1-Ops - auto* transpose_out_1 = builder.MakeOutput(); - builder.AddNode("Transpose", {gather_out_1}, {transpose_out_1}) - .AddAttribute("perm", std::vector{0, 2, 1, 3}); - - // Create Gather-2 Ops - auto* gather_index_2 = builder.MakeInitializer({}, {static_cast(-1)}); - auto* gather_out_2 = builder.MakeIntermediate({{2, 512, 1, 64}}); - builder.AddNode("Gather", {reshape_out, gather_index_2}, {gather_out_2}) - .AddAttribute("axis", static_cast(2)); - - // Create Transpose-2 Ops - auto* transpose_out_2 = builder.MakeOutput(); - builder.AddNode("Transpose", {gather_out_2}, {transpose_out_2}) - .AddAttribute("perm", std::vector{0, 2, 1, 3}); - - // Create Slice Ops - auto* slice_output = builder.MakeIntermediate(); - auto* starts = builder.MakeInitializer({1}, {0}); - auto* ends = builder.MakeInitializer({1}, {-2}); - auto* axes = builder.MakeInitializer({1}, {2}); - auto* steps = builder.MakeInitializer({1}, {1}); - builder.AddNode("Slice", {reshape_out, starts, ends, axes, steps}, {slice_output}); - - // Create Shape-1 Ops - auto* shape_output_1 = builder.MakeOutput(); - builder.AddNode("Shape", {slice_output}, {shape_output_1}); - - // Create Shape-2 Ops - auto* shape_output_2 = builder.MakeOutput(); - builder.AddNode("Shape", {slice_output}, {shape_output_2}); - - // Create Transpose-3 Ops - auto* transpose_out_3 = builder.MakeOutput(); - builder.AddNode("Transpose", {slice_output}, {transpose_out_3}) - .AddAttribute("perm", std::vector{0, 2, 1, 3}); - }; - - auto pre_graph_checker = [&](Graph& graph) { - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 2); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Slice"] == 1); - return Status::OK(); - }; - - auto post_graph_checker = [&](Graph& graph) { - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 0); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Slice"] == 0); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Split"] == 1); - - for (auto& node : graph.Nodes()) { - if (node.OpType() == "Split") { - auto& attrs = 
node.GetAttributes(); - TEST_RETURN_IF_NOT(static_cast(attrs.at("axis").i()) == 2); - } - } - return Status::OK(); - }; - - std::unique_ptr transformer = std::make_unique(); - ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 14, *logger_, std::move(transformer), - TransformerLevel::Level1, 1, pre_graph_checker, post_graph_checker)); - } -} - -TEST_F(GraphTransformationTests, GatherSliceToSplitFusion_Invalid) { - { - auto build_test_case = [&](ModelTestBuilder& builder) { - auto* data_arg = builder.MakeInput({{54}}); - auto* reshape_arg = builder.MakeInput({{4}}); - auto* reshape_out = builder.MakeIntermediate({{2, 512, 73, 64}}); - builder.AddNode("Reshape", {data_arg, reshape_arg}, {reshape_out}); - - // Create Gather-1 Ops - auto* gather_index_1 = builder.MakeInitializer({}, {static_cast(-2)}); - auto* gather_out_1 = builder.MakeIntermediate({{2, 512, 1, 64}}); - builder.AddNode("Gather", {reshape_out, gather_index_1}, {gather_out_1}) - .AddAttribute("axis", static_cast(2)); - - // Create Transpose 1-Ops - auto* transpose_out_1 = builder.MakeOutput(); - builder.AddNode("Transpose", {gather_out_1}, {transpose_out_1}) - .AddAttribute("perm", std::vector{0, 2, 1, 3}); - - // Create Slice Ops - auto* slice_output = builder.MakeIntermediate(); - auto* starts = builder.MakeInitializer({1}, {0}); - auto* ends = builder.MakeInitializer({1}, {-2}); - auto* axes = builder.MakeInitializer({1}, {2}); - auto* steps = builder.MakeInitializer({1}, {1}); - builder.AddNode("Slice", {reshape_out, starts, ends, axes, steps}, {slice_output}); - - // Create Shape-1 Ops - auto* shape_output_1 = builder.MakeOutput(); - builder.AddNode("Shape", {slice_output}, {shape_output_1}); - - // Create Shape-2 Ops - auto* shape_output_2 = builder.MakeOutput(); - builder.AddNode("Shape", {slice_output}, {shape_output_2}); - - // Create Transpose-3 Ops - auto* transpose_out_3 = builder.MakeOutput(); - builder.AddNode("Transpose", {slice_output}, {transpose_out_3}) - .AddAttribute("perm", std::vector{0, 2, 1, 3}); - }; - - auto pre_graph_checker = [&](Graph& graph) { - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 1); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Slice"] == 1); - return Status::OK(); - }; - - auto post_graph_checker = [&](Graph& graph) { - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Gather"] == 1); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Slice"] == 1); - TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Split"] == 0); - return Status::OK(); - }; - - std::unique_ptr transformer = std::make_unique(); - ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 14, *logger_, std::move(transformer), - TransformerLevel::Level1, 1, pre_graph_checker, post_graph_checker)); - } -} - } // namespace test } // namespace onnxruntime diff --git a/orttraining/orttraining/core/optimizer/graph_transformer_utils.cc b/orttraining/orttraining/core/optimizer/graph_transformer_utils.cc index 0b68dc65e41cd..5d527369a1b75 100644 --- a/orttraining/orttraining/core/optimizer/graph_transformer_utils.cc +++ b/orttraining/orttraining/core/optimizer/graph_transformer_utils.cc @@ -24,7 +24,6 @@ #include "core/optimizer/fast_gelu_fusion.h" #include "core/optimizer/free_dim_override_transformer.h" #include "core/optimizer/gather_fusion.h" -#include "core/optimizer/gather_slice_fusion.h" #include "core/optimizer/gelu_approximation.h" #include "core/optimizer/gelu_fusion.h" #include "core/optimizer/gemm_activation_fusion.h" @@ -139,9 +138,8 @@ std::vector> GeneratePreTrainingTransformers( 
transformers.emplace_back(std::make_unique(compatible_eps)); transformers.emplace_back(std::make_unique(compatible_eps)); transformers.emplace_back(std::make_unique(compatible_eps)); - transformers.emplace_back(std::make_unique(compatible_eps)); - transformers.emplace_back(std::make_unique(compatible_eps)); transformers.emplace_back(std::make_unique(compatible_eps)); + transformers.emplace_back(std::make_unique(compatible_eps)); // If a model with Q, DQ nodes is being used for the purpose of training, it must be for // Quantization Aware Training. So, replace QDQ nodes with FakeQuant. transformers.emplace_back(std::make_unique(compatible_eps)); From c1bf7fcd2fb105e067dc1f2edd408c399a61a1fe Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Thu, 29 Feb 2024 01:19:25 -0800 Subject: [PATCH 085/279] [QNN Quant] Ensure 16bit tensor quant overrides set MS domain (#19684) ### Description Ensures that DQ and Q ops use the msft domain if tensor quantization overrides specify 16-bit integer types. ### Motivation and Context ONNX does not yet support 16bit integer types for QuantizeLinear and DequantizeLinear ops (coming soon). For now, DQ/Q ops must use the MSFT domain. We have to also check if tensor quantization overrides force the use of 16-bit quantization types. If so, we must correctly set the domain for Q/DQ ops. --- .../tools/quantization/onnx_quantizer.py | 11 ++++--- .../tools/quantization/qdq_quantizer.py | 5 ++- .../test_tensor_quant_overrides_option.py | 32 ++++++++++++++++++- 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/onnxruntime/python/tools/quantization/onnx_quantizer.py b/onnxruntime/python/tools/quantization/onnx_quantizer.py index 9450426f12444..19a72e38dea33 100644 --- a/onnxruntime/python/tools/quantization/onnx_quantizer.py +++ b/onnxruntime/python/tools/quantization/onnx_quantizer.py @@ -154,7 +154,7 @@ def __init__( if self.mode not in QuantizationMode: raise ValueError(f"unsupported quantization mode {self.mode}") - self.tensor_quant_overrides = self._get_and_check_tensor_quant_overrides() + self.tensor_quant_overrides, self.tensor_quant_override_types = self._get_and_check_tensor_quant_overrides() self.quantization_params = self.calculate_quantization_params() # QuantizeRange tensor name and zero tensor name for scale and zero point calculation. @@ -177,8 +177,10 @@ def __init__( def _get_and_check_tensor_quant_overrides(self): """ Get tensor quantization overrides and check correctness. + Also returns a set of quantization types (as TensorProto) specified across all overrides. """ tensor_quant_overrides = self.extra_options.get("TensorQuantOverrides", {}) + tensor_quant_override_types = set() # Validate that compatible/valid overrides are provided. if tensor_quant_overrides: @@ -211,6 +213,8 @@ def _get_and_check_tensor_quant_overrides(self): # other channels. if index == 0: quant_type = quant_overrides.get("quant_type") + if quant_type is not None: + tensor_quant_override_types.add(quant_type.tensor_type) elif quant_type != quant_overrides.get("quant_type"): raise ValueError( "Channel quantization types for tensor '{tensor_name}' do not match at index {index}." 
@@ -231,7 +235,7 @@ def _get_and_check_tensor_quant_overrides(self): f"Tensor override option '{key}' is invalid with 'scale' and 'zero_point'" ) - return tensor_quant_overrides + return tensor_quant_overrides, tensor_quant_override_types def get_per_tensor_quant_overrides(self, tensor_name): quant_overrides_list = self.tensor_quant_overrides.get(tensor_name, [{}]) @@ -747,8 +751,7 @@ def _get_quantization_params(self, param_name, use_scale=None, use_zeropoint=Non raise ValueError(f"Unexpected type {type(params['scale'])} and param_name={param_name!r}") scale_values = np.array([params["scale"]]) assert scale_values.dtype != np.float64 - # zero_point_type = params["quant_type"] - assert zero_point_type == params["quant_type"] + zero_point_type = params["quant_type"] else: zero_point_values = np.array([use_zeropoint]) scale_values = np.array([use_scale]) diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py index 775a3e8b8b588..76cd0d21fca37 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -116,7 +116,10 @@ def __init__( # if the activation or weight types are 16-bit integers. # TODO: Remove this override (and use only the 'UseQDQContribOps' option) if/when ONNX adds 16-bit support. int16_types = (TensorProto.UINT16, TensorProto.INT16) - if not self.qdq_op_domain and (self.activation_qType in int16_types or self.weight_qType in int16_types): + overrides_have_int16 = any(t in int16_types for t in self.tensor_quant_override_types) + if not self.qdq_op_domain and ( + self.activation_qType in int16_types or self.weight_qType in int16_types or overrides_have_int16 + ): logging.warning( "ONNX QuantizeLinear and DequantizeLinear operators do not support 16-bit integer quantization types. " f"The domain of QuantizeLinear and DequantizeLinear operators will be set to '{ms_domain}' to " diff --git a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py index cbb6b3ae2e776..9ea4719f3c595 100644 --- a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py +++ b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py @@ -13,7 +13,7 @@ from onnxruntime import quantization from onnxruntime.quantization.execution_providers.qnn import get_qnn_qdq_config -from onnxruntime.quantization.quant_utils import compute_scale_zp, get_qmin_qmax_for_qType +from onnxruntime.quantization.quant_utils import compute_scale_zp, get_qmin_qmax_for_qType, ms_domain class DummyDataReader(quantization.CalibrationDataReader): @@ -423,6 +423,36 @@ def test_qdq_overrides_per_channel2(self): self.assertEqual(zp, expected_zp) self.assertEqual(scale, np.float32(expected_scale)) + def test_16bit_overrides_set_ms_domain(self): + """ + Test that overriding a tensor to 16bit (when default is 8bit) automatically sets the 'com.microsoft' + domain on DQ and Q ops. 
+ """ + qdq_model_name = "model_quant_overrides_to_16bit.onnx" + inp_zp, _, sig_out_zp, _, _, _, _, _, out_zp, _ = self.perform_qdq_quantization( + qdq_model_name, + activation_type=onnx.TensorProto.UINT8, # Default to 8bit activations + extra_options={ + "TensorQuantOverrides": { + "INP": [{"quant_type": quantization.QuantType.QUInt16}], + "SIG_OUT": [{"quant_type": quantization.QuantType.QUInt16}], + } + }, + ) + + # Input and Sigmoid's output should be overridden to 16bit + self.assertEqual(inp_zp.data_type, onnx.TensorProto.UINT16) + self.assertEqual(sig_out_zp.data_type, onnx.TensorProto.UINT16) + + # Output should the default uint8 type + self.assertEqual(out_zp.data_type, onnx.TensorProto.UINT8) + + # Q/DQ ops should all have the 'com.microsoft' domain + qdq_model = onnx.load_model(qdq_model_name) + for node in qdq_model.graph.node: + if node.op_type in {"QuantizeLinear", "DequantizeLinear"}: + self.assertEqual(node.domain, ms_domain) + def test_override_validation_nonexisting_tensor(self): """ Test that specifying a non-existing tensor should fail. From c311d1faf50167e38613927e44c8a430ffcc8e89 Mon Sep 17 00:00:00 2001 From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com> Date: Thu, 29 Feb 2024 17:51:29 +0800 Subject: [PATCH 086/279] [ROCm] Update dockerfile (#19661) Update dockerfile to ROCm6.0 --- dockerfiles/Dockerfile.migraphx | 43 +++------------------------------ dockerfiles/Dockerfile.rocm | 4 +-- dockerfiles/README.md | 4 +-- 3 files changed, 8 insertions(+), 43 deletions(-) diff --git a/dockerfiles/Dockerfile.migraphx b/dockerfiles/Dockerfile.migraphx index bc513a8e8ba6d..c3541a8bd3425 100644 --- a/dockerfiles/Dockerfile.migraphx +++ b/dockerfiles/Dockerfile.migraphx @@ -5,57 +5,22 @@ # Dockerfile to run ONNXRuntime with MIGraphX integration #-------------------------------------------------------------------------- -FROM ubuntu:20.04 +FROM rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1 ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime ARG ONNXRUNTIME_BRANCH=main -ARG ROCM_VERSION=5.4 -# MIGraphX version should be the same as ROCm version -ARG MIGRAPHX_VERSION=rocm-5.4.0 -ENV DEBIAN_FRONTEND noninteractive -ENV MIGRAPHX_DISABLE_FAST_GELU=1 -RUN apt-get clean && apt-get update && apt-get install -y locales -RUN locale-gen en_US.UTF-8 -RUN update-locale LANG=en_US.UTF-8 -ENV LC_ALL C.UTF-8 -ENV LANG C.UTF-8 +ENV PATH /code/cmake-3.27.3-linux-x86_64/bin:${PATH} -# Install rocm -RUN apt-get update && apt-get install -y gnupg2 --no-install-recommends curl && \ - curl -sL http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ - sh -c 'echo deb [arch=amd64] http://repo.radeon.com/rocm/apt/${ROCM_VERSION}/ ubuntu main > /etc/apt/sources.list.d/rocm.list' - -RUN apt-get update &&\ - apt-get install -y sudo git bash build-essential rocm-dev python3-dev python3-pip miopen-hip \ - rocblas half aria2 libnuma-dev pkg-config - -RUN aria2c -q -d /tmp -o cmake-3.27.3-linux-x86_64.tar.gz \ -https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-x86_64.tar.gz &&\ -tar -zxf /tmp/cmake-3.27.3-linux-x86_64.tar.gz --strip=1 -C /usr - -# Install rbuild -RUN pip3 install https://github.com/RadeonOpenCompute/rbuild/archive/master.tar.gz numpy yapf==0.28.0 - -ENV PATH /opt/miniconda/bin:/code/cmake-3.27.3-linux-x86_64/bin:${PATH} - -# Install MIGraphX from source -RUN mkdir -p /migraphx -RUN cd /migraphx && git clone --depth=1 --branch ${MIGRAPHX_VERSION} https://github.com/ROCmSoftwarePlatform/AMDMIGraphX src -RUN cd /migraphx && rbuild 
package --cxx /opt/rocm/llvm/bin/clang++ -d /migraphx/deps -B /migraphx/build -S /migraphx/src/ -DPYTHON_EXECUTABLE=/usr/bin/python3 -RUN dpkg -i /migraphx/build/*.deb -RUN rm -rf /migraphx - -# Install rocm ep dependencies RUN apt-get update &&\ - apt-get install -y rocrand rccl hipsparse hipfft hipcub hipblas rocthrust + apt-get install -y migraphx WORKDIR /code # Prepare onnxruntime repository & build onnxruntime RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh &&\ - cd onnxruntime &&\ + cd onnxruntime && pip install --upgrade pip &&\ /bin/sh ./build.sh --allow_running_as_root --cmake_extra_defines ONNXRUNTIME_VERSION=`cat ./VERSION_NUMBER` --config Release --parallel \ --skip_tests --build_wheel --use_rocm --rocm_version=${ROCM_VERSION} --rocm_home /opt/rocm --use_migraphx &&\ pip install /code/onnxruntime/build/Linux/Release/dist/*.whl diff --git a/dockerfiles/Dockerfile.rocm b/dockerfiles/Dockerfile.rocm index 35a676383337b..c242933f677f0 100644 --- a/dockerfiles/Dockerfile.rocm +++ b/dockerfiles/Dockerfile.rocm @@ -5,14 +5,14 @@ # Dockerfile to run ONNXRuntime with ROCm integration #-------------------------------------------------------------------------- -FROM rocm/pytorch:rocm5.4_ubuntu20.04_py3.7_pytorch_1.12.1 +FROM rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1 ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime ARG ONNXRUNTIME_BRANCH=main WORKDIR /code -ENV PATH /opt/miniconda/bin:/code/cmake-3.27.3-linux-x86_64/bin:${PATH} +ENV PATH /code/cmake-3.27.3-linux-x86_64/bin:${PATH} # Prepare onnxruntime repository & build onnxruntime RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ diff --git a/dockerfiles/README.md b/dockerfiles/README.md index f226ebfe8b193..a2e99d66d4654 100644 --- a/dockerfiles/README.md +++ b/dockerfiles/README.md @@ -277,7 +277,7 @@ Nothing else from ONNX Runtime source tree will be copied/installed to the image Note: When running the container you built in Docker, please either use 'nvidia-docker' command instead of 'docker', or use Docker command-line options to make sure NVIDIA runtime will be used and appropiate files mounted from host. Otherwise, CUDA libraries won't be found. You can also [set NVIDIA runtime as default in Docker](https://github.com/dusty-nv/jetson-containers#docker-default-runtime). ## MIGraphX -**Ubuntu 20.04, ROCm5.4, AMDMIGraphX v1.2** +**Ubuntu 20.04, ROCm6.0, MIGraphX** 1. Build the docker image from the Dockerfile in this repository. ``` @@ -291,7 +291,7 @@ Note: When running the container you built in Docker, please either use 'nvidia- ``` ## ROCm -**Ubuntu 20.04, ROCm5.4** +**Ubuntu 20.04, ROCm6.0** 1. Build the docker image from the Dockerfile in this repository. ``` From 937cdd651e4f656e65053d027c71b51f1e1411ec Mon Sep 17 00:00:00 2001 From: Vincent Wang Date: Thu, 29 Feb 2024 23:03:57 +0800 Subject: [PATCH 087/279] [ORTMODULE] Support Register Custom Triton Kernel (#19690) Add support for registering custom Triton kernel function. 
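For context, a minimal usage sketch of the new hook (not part of the change itself): `register_triton_kernel` records a callable under its function name so that `call_triton_by_name` can later resolve it. The import path and the kernel body below are assumptions for illustration; a real registration would typically launch a `@triton.jit` kernel rather than plain PyTorch ops.

```python
# Minimal sketch, assuming the package is importable as onnxruntime.training.ort_triton.
# The body uses plain PyTorch purely for illustration; a real kernel wrapper would
# launch a @triton.jit kernel on the incoming tensors.
import torch

from onnxruntime.training.ort_triton import register_triton_kernel


@register_triton_kernel
def scaled_add(x: torch.Tensor, y: torch.Tensor, alpha: float = 1.0) -> torch.Tensor:
    # The decorator stores this function in the custom-kernel registry under its
    # __name__, so call_triton_by_name("scaled_add", ...) can dispatch to it by name.
    return x + alpha * y
```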
--- .../python/training/ort_triton/__init__.py | 1 + .../python/training/ort_triton/triton_op_executor.py | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/orttraining/orttraining/python/training/ort_triton/__init__.py b/orttraining/orttraining/python/training/ort_triton/__init__.py index fbb59d1354ae7..5f2d0c62ffa50 100644 --- a/orttraining/orttraining/python/training/ort_triton/__init__.py +++ b/orttraining/orttraining/python/training/ort_triton/__init__.py @@ -9,6 +9,7 @@ from onnxruntime.capi import _pybind_state as _C from .kernel import * # noqa: F403 +from .triton_op_executor import register_triton_kernel # noqa: F401 from .triton_op_executor import call_triton_by_name, call_triton_by_onnx, get_config diff --git a/orttraining/orttraining/python/training/ort_triton/triton_op_executor.py b/orttraining/orttraining/python/training/ort_triton/triton_op_executor.py index f16abc71251ed..e104ea13c59a3 100644 --- a/orttraining/orttraining/python/training/ort_triton/triton_op_executor.py +++ b/orttraining/orttraining/python/training/ort_triton/triton_op_executor.py @@ -23,6 +23,8 @@ _DEBUG_MODE = "ORTMODULE_TRITON_DEBUG" in os.environ and int(os.getenv("ORTMODULE_TRITON_DEBUG")) == 1 +_CUSTOM_KERNELS = dict() + @functools.lru_cache(None) def _gen_module_internal(sorted_graph: SortedGraph) -> Tuple[str, str, ModuleType]: @@ -113,7 +115,10 @@ def call_triton_by_name(func_name: str, *tensors, **kwargs): """ torch_tensors = [_from_dlpack(tensor) if tensor is not None else None for tensor in tensors] - func = getattr(sys.modules[".".join(__name__.split(".")[:-1])], func_name) + func = getattr(sys.modules[".".join(__name__.split(".")[:-1])], func_name, None) + if func is None: + func = _CUSTOM_KERNELS.get(func_name) + assert func is not None, f"Function {func_name} is not found in the registered kernels." output = func(*torch_tensors, **kwargs) if output is not None: if isinstance(output, tuple): @@ -138,3 +143,8 @@ def call_triton_by_onnx(onnx_key: int, onnx_str: bytes, *tensors): if isinstance(output, tuple): return tuple([to_dlpack(tensor) for tensor in output]) return to_dlpack(output) + + +def register_triton_kernel(fn): + _CUSTOM_KERNELS[fn.__name__] = fn + return fn From ec0e4d3b6572c18a3462eb6efb3bb007ec3a2962 Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Thu, 29 Feb 2024 10:31:57 -0800 Subject: [PATCH 088/279] Parallel Transpose_BSNH_to_BNSH (#19406) Achieved a speedup of 1.098 in MultiHeadAttention and an end-to-end speedup of 1.021 in the OCR model through parallelization of the Transpose_BSNH_to_BNSH operation. 
--- onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc b/onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc index eb25d0fd7cc1e..c4e4b4ec707fb 100644 --- a/onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc +++ b/onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc @@ -58,11 +58,12 @@ Status Reshape_BSD_to_BSNH(Tensor* qkv, // Transpose Q/K/V from BxSxNxH to BxNxSxH Status Transpose_BSNH_to_BNSH(const Tensor* qkv, - OrtValue& qkv_transposed) { + OrtValue& qkv_transposed, + concurrency::ThreadPool* tp = nullptr) { std::vector permutations({0, 2, 1, 3}); gsl::span permutations_span{permutations}; size_t from = 2, to = 1; - SingleAxisTranspose(permutations_span, *qkv, *qkv_transposed.GetMutable(), from, to); + SingleAxisTranspose(permutations_span, *qkv, *qkv_transposed.GetMutable(), from, to, nullptr, tp); return Status::OK(); } @@ -143,7 +144,8 @@ Status AddBiasTranspose(const Tensor* qkv, // Input: Q/K/V dat ORT_RETURN_IF_ERROR(Reshape_BSD_to_BSNH(qkv_with_bias.GetMutable(), batch_size, sequence_length, num_heads, head_size)); // Transpose Q from BxSxNxH to BxNxSxH - ORT_RETURN_IF_ERROR(Transpose_BSNH_to_BNSH(qkv_with_bias.GetMutable(), qkv_with_bias_transposed)); + auto tp = context->GetOperatorThreadPool(); + ORT_RETURN_IF_ERROR(Transpose_BSNH_to_BNSH(qkv_with_bias.GetMutable(), qkv_with_bias_transposed, tp)); return Status::OK(); } From d5606cd7ee394ba9444ef509021720ebe63c9856 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Thu, 29 Feb 2024 13:40:56 -0800 Subject: [PATCH 089/279] Introducing customizable input names for loss in generate_artifacts. (#19705) # loss function extra inputs. Currently, the loss functions in onnxblock expect exactly two inputs in their build method. Occasionally, models may pass additional inputs, causing the build function to fail. To solve this issue, we can let users pass a list of loss input names to be used in the loss function. --- .../orttraining/python/training/artifacts.py | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/orttraining/orttraining/python/training/artifacts.py b/orttraining/orttraining/python/training/artifacts.py index 7a4eb251bc5bc..4e76174d8255e 100644 --- a/orttraining/orttraining/python/training/artifacts.py +++ b/orttraining/orttraining/python/training/artifacts.py @@ -48,6 +48,7 @@ def generate_artifacts( custom_op_library: Optional[Union[str, bytes, os.PathLike]] = None, additional_output_names: Optional[List[str]] = None, nominal_checkpoint: bool = False, + loss_input_names: Optional[List[str]] = None, ) -> None: """Generates artifacts required for training with ORT training api. @@ -77,7 +78,9 @@ def generate_artifacts( Default is False. Nominal checkpoint is a checkpoint that contains nominal information about the model parameters. It can be used on the device to reduce overhead while constructing the training model as well as to reduce the size of the checkpoint packaged with the on-device application. - + loss_input_names: Specifies a list of input names to be used specifically for the loss computation. When provided, + only these inputs will be passed to the loss function. If `None`, all graph outputs are passed to + the loss function. Raises: RuntimeError: If the loss provided is neither one of the supported losses nor an instance of `onnxblock.Block` RuntimeError: If the optimizer provided is not one of the supported optimizers. 
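A usage sketch of the new argument follows; the model file, parameter names, and the "logits" output name are hypothetical and depend on the exported model, and the loss/optimizer enums are assumed to be the ones exposed by the same `artifacts` module.

```python
# Minimal sketch of the new loss_input_names argument; file names, parameter names,
# and the "logits" output name are hypothetical.
import onnx

from onnxruntime.training import artifacts

model = onnx.load("model_with_extra_outputs.onnx")

artifacts.generate_artifacts(
    model,
    requires_grad=["fc.weight", "fc.bias"],
    loss=artifacts.LossType.CrossEntropyLoss,
    optimizer=artifacts.OptimType.AdamW,
    artifact_directory="training_artifacts",
    # Only "logits" is wired into the loss block; other graph outputs are not passed
    # to the loss (they can still be kept via additional_output_names).
    loss_input_names=["logits"],
)
```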
@@ -111,11 +114,16 @@ def generate_artifacts( logging.info("Custom loss block provided: %s", loss.__class__.__name__) class _TrainingBlock(onnxblock.TrainingBlock): - def __init__(self, _loss): + def __init__(self, _loss, _loss_input_names=None): super().__init__() self._loss = _loss + self._loss_input_names = _loss_input_names def build(self, *inputs_to_loss): + # If loss_input_names is passed, only pass the specified input names to the loss function. + if self._loss_input_names: + inputs_to_loss = self._loss_input_names + if additional_output_names: # If additional output names is not a list, raise an error if not isinstance(additional_output_names, list): @@ -132,7 +140,7 @@ def build(self, *inputs_to_loss): return self._loss(*inputs_to_loss) - training_block = _TrainingBlock(loss_block) + training_block = _TrainingBlock(loss_block, loss_input_names) if requires_grad is not None and frozen_params is not None and set(requires_grad).intersection(set(frozen_params)): raise RuntimeError( @@ -157,9 +165,11 @@ def build(self, *inputs_to_loss): logging.info("Custom op library provided: %s", custom_op_library) custom_op_library_path = pathlib.Path(custom_op_library) - with onnxblock.base(model), onnxblock.custom_op_library( - custom_op_library_path - ) if custom_op_library is not None else contextlib.nullcontext(): + with onnxblock.base(model), ( + onnxblock.custom_op_library(custom_op_library_path) + if custom_op_library is not None + else contextlib.nullcontext() + ): _ = training_block(*[output.name for output in model.graph.output]) training_model, eval_model = training_block.to_model_proto() model_params = training_block.parameters() From 5ee62a6bcc228e63704f64f2de46d61d2c57a281 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Thu, 29 Feb 2024 14:46:42 -0800 Subject: [PATCH 090/279] CUDA Resize-18 implementation (#19595) ### Description Implement Resize-18 on CUDA. ### Motivation and Context Performance --- docs/OperatorKernels.md | 3 +- .../providers/cpu/cpu_execution_provider.cc | 6 +- .../core/providers/cpu/cpu_provider_shared.cc | 8 + .../core/providers/cpu/cpu_provider_shared.h | 5 + .../core/providers/cpu/tensor/upsample.cc | 79 +- .../core/providers/cpu/tensor/upsample.h | 14 +- .../providers/cpu/tensor/upsample_antialias.h | 95 +- .../core/providers/cpu/tensor/upsamplebase.h | 191 ++- .../core/providers/cuda/cu_inc/common.cuh | 12 +- .../providers/cuda/cuda_execution_provider.cc | 30 +- .../core/providers/cuda/tensor/resize.cc | 14 +- .../cuda/tensor/resize_antialias_impl.cu | 1179 +++++++++++++++++ .../core/providers/cuda/tensor/resize_impl.cu | 254 ++-- .../core/providers/cuda/tensor/resize_impl.h | 111 ++ .../core/providers/cuda/tensor/upsample.cc | 254 +++- .../core/providers/cuda/tensor/upsample.h | 10 +- .../providers/rocm/rocm_execution_provider.cc | 40 +- .../provider_bridge_provider.cc | 7 +- .../core/providers/xnnpack/tensor/resize.cc | 2 +- .../providers/cpu/tensor/resize_op_test.cc | 171 ++- 20 files changed, 2090 insertions(+), 395 deletions(-) create mode 100644 onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index b0ed68d595c42..1eaf0fb6dad76 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -734,7 +734,8 @@ Do not modify directly.* |||13|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**shape** = tensor(int64)|
 |||[5, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br/> **shape** = tensor(int64)|
 |||[1, 4]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
-|Resize|*in* X:**T**<br> *in* scales:**tensor(float)**<br> *out* Y:**T**<br><br> or<br><br> *in* X:**T1**<br> *in* roi:**T2**<br> *in* scales:**tensor(float)**<br> *in* sizes:**tensor(int64)**<br> *out* Y:**T1**|13+|**T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
+|Resize|*in* X:**T**<br> *in* scales:**tensor(float)**<br> *out* Y:**T**<br><br> or<br><br> *in* X:**T1**<br> *in* roi:**T2**<br> *in* scales:**tensor(float)**<br> *in* sizes:**tensor(int64)**<br> *out* Y:**T1**|18+|**T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
+|||[13, 17]|**T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
 |||[11, 12]|**T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
 |||10|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
 |ReverseSequence|*in* input:**T**<br> *in* sequence_lens:**tensor(int64)**<br>
*out* Y:**T**|10+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc index 48e4617b33b4d..37e7e42150413 100644 --- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc +++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc @@ -2008,8 +2008,10 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) { Greater)>, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo namespace onnxruntime { // The suppressed warning is: "The type with a virtual function needs either public virtual or protected nonvirtual destructor." @@ -292,6 +294,12 @@ struct ProviderHostCPUImpl : ProviderHostCPU { Status Sampling__Compute(const contrib::transformers::Sampling* p, OpKernelContext* ctx) override { return p->contrib::transformers::Sampling::Compute(ctx); } Status Sampling__SetupSubgraphExecutionInfo(contrib::transformers::Sampling* p, const SessionState& session_state, const std::string& attribute_name, const SessionState& subgraph_session_state) override { return p->contrib::transformers::Sampling::SetupSubgraphExecutionInfo(session_state, attribute_name, subgraph_session_state); } + void UpsampleBase__AdjustOutputSizeAsPolicy(const UpsampleBase* p, TensorShapeVector& output_dims, + gsl::span input_dims, + InlinedVector& scales) const override { + p->AdjustOutputSizeAsPolicy(output_dims, input_dims, scales); + } + #ifdef ENABLE_ATEN Status ATen__Compute(const contrib::ATen* p, OpKernelContext* p_ctx) override { return p->ATen::Compute(p_ctx); } #endif diff --git a/onnxruntime/core/providers/cpu/cpu_provider_shared.h b/onnxruntime/core/providers/cpu/cpu_provider_shared.h index f33eec4b93e98..c0e674827e4d1 100644 --- a/onnxruntime/core/providers/cpu/cpu_provider_shared.h +++ b/onnxruntime/core/providers/cpu/cpu_provider_shared.h @@ -24,6 +24,7 @@ class SliceOp__PrepareForComputeMetadata; // Directly maps to SliceOp::PrepareF class UnsqueezeBase__Prepare; // Directly maps to UnsqueezeBase::Prepare class contrib__AdamWOptimizerBase__Prepare; class contrib__SGDOptimizerV2Base__Prepare; +class UpsampleBase; using PadsVector = InlinedVector; @@ -202,6 +203,10 @@ struct ProviderHostCPU { virtual Status Sampling__Compute(const contrib::transformers::Sampling* p, OpKernelContext* ctx) = 0; virtual Status Sampling__SetupSubgraphExecutionInfo(contrib::transformers::Sampling* p, const SessionState& session_state, const std::string& attribute_name, const SessionState& subgraph_session_state) = 0; + virtual void UpsampleBase__AdjustOutputSizeAsPolicy(const UpsampleBase* p, TensorShapeVector& output_dims, + gsl::span input_dims, + InlinedVector& scales) const = 0; + #ifdef ENABLE_ATEN virtual Status ATen__Compute(const contrib::ATen* p, OpKernelContext* p_ctx) = 0; #endif diff --git a/onnxruntime/core/providers/cpu/tensor/upsample.cc b/onnxruntime/core/providers/cpu/tensor/upsample.cc index fa69e144be554..babbac0b7be17 100644 --- a/onnxruntime/core/providers/cpu/tensor/upsample.cc +++ b/onnxruntime/core/providers/cpu/tensor/upsample.cc @@ -1,10 +1,15 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+#include "core/providers/cpu/tensor/upsample.h" + +#include + +#include "core/common/inlined_containers.h" #include "core/common/safeint.h" #include "core/platform/threadpool.h" -#include "core/providers/cpu/tensor/upsample.h" #include "core/providers/cpu/tensor/upsample_antialias.h" + using namespace onnxruntime::common; using namespace std; using onnxruntime::narrow; @@ -30,6 +35,46 @@ REGISTER_VERSIONED_TYPED_KERNEL(int32_t, 9, 9); REGISTER_VERSIONED_TYPED_KERNEL(int8_t, 9, 9); REGISTER_VERSIONED_TYPED_KERNEL(uint8_t, 9, 9); +void UpsampleBase::AdjustOutputSizeAsPolicy(TensorShapeVector& output_dims, gsl::span input_dims, + InlinedVector& scales) const { + // AspectRatioPolicy::STRETCH is default policy when opset < 18 + if (keep_aspect_ratio_policy_ == AspectRatioPolicy::STRETCH) { + return; + } + + InlinedHashSet axes_set(axes_.begin(), axes_.end()); + + float scale_in_policy = 0.0f; + if (keep_aspect_ratio_policy_ == AspectRatioPolicy ::NOT_LARGER) { + scale_in_policy = std::numeric_limits::max(); + + for (size_t i = 0; i < scales.size(); i++) { + if (axes_set.empty() || axes_set.count(i) > 0) { + scale_in_policy = std::min(scale_in_policy, scales[i]); + } + } + } else if (keep_aspect_ratio_policy_ == AspectRatioPolicy ::NOT_SMALLER) { + scale_in_policy = std::numeric_limits::min(); + + for (size_t i = 0; i < scales.size(); i++) { + if (axes_set.empty() || axes_set.count(i) > 0) { + scale_in_policy = std::max(scale_in_policy, scales[i]); + } + } + } + + for (size_t i = 0; i < scales.size(); i++) { + // if axes is not specified (AKA axes_set.empty()), we apply the policy to all axes + if (axes_set.empty() || axes_set.count(i) > 0) { + scales[i] = scale_in_policy; + output_dims[i] = static_cast(std::round(scales[i] * input_dims[i])); + } else { + scales[i] = 1.0f; + output_dims[i] = input_dims[i]; + } + } +} + template void UpsampleNearest2x(int64_t batch_size, int64_t num_channels, @@ -94,8 +139,8 @@ UpsampleNearestSetupInputMappings(int64_t n_dim, const TensorShape& input_shape, const TensorShape& output_shape, const std::vector& input_dim_factor, - const vector& scales, - const vector& roi, + gsl::span scales, + gsl::span roi, bool extrapolation_enabled, const GetOriginalCoordinateFunc& get_original_coordinate, const GetNearestPixelFunc& get_nearest_pixel) { @@ -141,8 +186,8 @@ static Status UpsampleNearestImpl(const T* input, T* output, const TensorShape& input_shape, const TensorShape& output_shape, - const vector& scales, - const vector& roi, + gsl::span scales, + gsl::span roi, bool extrapolation_enabled, const T extrapolation_value, const GetOriginalCoordinateFunc& get_original_coordinate, @@ -285,8 +330,8 @@ static Status UpsampleNearest(const T* input, T* output, const TensorShape& input_shape, const TensorShape& output_shape, - const vector& scales, - const vector& roi, + gsl::span scales, + gsl::span roi, bool is_resize, bool extrapolation_enabled, T extrapolation_value, @@ -412,7 +457,7 @@ BilinearParams SetupUpsampleBilinear(const int32_t input_height, const int32_t output_width, const float height_scale, const float width_scale, - const std::vector& roi, + gsl::span roi, AllocatorPtr& alloc, const GetOriginalCoordinateFunc& get_original_coordinate, const bool is_nchw) { @@ -518,7 +563,7 @@ BilinearParamsInteger SetupUpsampleBilinearInteger(const int32_t input_height, const int32_t output_width, const float height_scale, const float width_scale, - const std::vector& roi, + gsl::span roi, AllocatorPtr& alloc, const GetOriginalCoordinateFunc& get_original_coordinate, const 
bool is_nchw) { @@ -650,7 +695,7 @@ static TrilinearParams SetupUpsampleTrilinear(int64_t input_depth, float depth_scale, float height_scale, float width_scale, - const std::vector& roi, + gsl::span roi, AllocatorPtr& alloc, const GetOriginalCoordinateFunc& get_original_coordinate) { TrilinearParams p; @@ -796,7 +841,7 @@ void UpsampleTrilinear(int64_t batch_size, float depth_scale, float height_scale, float width_scale, - const std::vector& roi, + gsl::span roi, bool use_extrapolation, float extrapolation_value, const T* XdataBase, @@ -929,7 +974,7 @@ void ResizeBiCubic(int64_t batch_size, bool use_extrapolation, float extrapolation_value, bool exclude_outside, - const std::vector& roi, + gsl::span roi, const T* Xdata, T* Ydata, const GetOriginalCoordinateFunc& get_original_coordinate) { @@ -1067,9 +1112,9 @@ void ResizeBiCubic(int64_t batch_size, template Status Upsample::BaseCompute(OpKernelContext* context, - const std::vector& roi, - const std::vector& scales, - const gsl::span& output_dims) const { + gsl::span roi, + gsl::span scales, + gsl::span output_dims) const { const auto* X = context->Input(0); auto dims = X->Shape().GetDims(); ORT_RETURN_IF_NOT(output_dims.size() == dims.size(), "Rank of input and output tensor should be same."); @@ -1327,7 +1372,7 @@ Status Upsample::Compute(OpKernelContext* context) const { // Initialize the roi array to all zeros as this will be the most common case // Roi data is needed only when coordinate transformation mode is set to tf_crop_and_resize // for all other cases we need a 0 initialized roi array - std::vector roi_array(roi_); + InlinedVector roi_array(roi_); if (!roi_cached_) { bool use_default_roi = true; @@ -1353,7 +1398,7 @@ Status Upsample::Compute(OpKernelContext* context) const { ComputeROIWithAxes(roi_array, input_dims.size()); // Get scales data - std::vector scales_array(input_dims.size()); + InlinedVector scales_array(input_dims.size()); if (OpKernel::Node().InputDefs().size() == 1) { // Compute output shape from scales and input dims diff --git a/onnxruntime/core/providers/cpu/tensor/upsample.h b/onnxruntime/core/providers/cpu/tensor/upsample.h index 3046ee4b8260d..8ff04781f6ad0 100644 --- a/onnxruntime/core/providers/cpu/tensor/upsample.h +++ b/onnxruntime/core/providers/cpu/tensor/upsample.h @@ -66,8 +66,8 @@ class Upsample : public UpsampleBase, public OpKernel { Status Compute(OpKernelContext* context) const override; - Status BaseCompute(OpKernelContext* context, const std::vector& roi, const std::vector& scales, - const gsl::span& output_dims) const; + Status BaseCompute(OpKernelContext* context, gsl::span roi, gsl::span scales, + gsl::span output_dims) const; }; BilinearParams SetupUpsampleBilinear(const int32_t input_height, @@ -76,7 +76,7 @@ BilinearParams SetupUpsampleBilinear(const int32_t input_height, const int32_t output_width, const float height_scale, const float width_scale, - const std::vector& roi, + gsl::span roi, AllocatorPtr& alloc, const GetOriginalCoordinateFunc& get_original_coordinate, const bool is_nchw); @@ -90,7 +90,7 @@ void UpsampleBilinear(const int32_t batch_size, const int32_t output_width, const float height_scale, const float width_scale, - const std::vector& roi, + gsl::span roi, const bool use_extrapolation, const float extrapolation_value, const T* const XdataBase, @@ -144,7 +144,7 @@ void NhwcUpsampleBilinear(const int32_t batch_size, const int32_t output_width, const float height_scale, const float width_scale, - const std::vector& roi, + gsl::span roi, const float extrapolation_value, 
const T* const XdataBase, T* const YdataBase, @@ -227,7 +227,7 @@ BilinearParamsInteger SetupUpsampleBilinearInteger(const int32_t input_height, const int32_t output_width, const float height_scale, const float width_scale, - const std::vector& roi, + gsl::span roi, AllocatorPtr& alloc, const GetOriginalCoordinateFunc& get_original_coordinate, const bool is_nchw); @@ -241,7 +241,7 @@ void NhwcUpsampleBilinearInteger(const int32_t batch_size, const int32_t output_width, const float height_scale, const float width_scale, - const std::vector& roi, + gsl::span roi, const float extrapolation_value, const T* const XdataBase, T* const YdataBase, diff --git a/onnxruntime/core/providers/cpu/tensor/upsample_antialias.h b/onnxruntime/core/providers/cpu/tensor/upsample_antialias.h index e1dcaf500a325..1e32b7e874b1a 100644 --- a/onnxruntime/core/providers/cpu/tensor/upsample_antialias.h +++ b/onnxruntime/core/providers/cpu/tensor/upsample_antialias.h @@ -21,32 +21,6 @@ namespace onnxruntime { -namespace ConstValue { -constexpr int32_t mag_factor = 1 << (22 - 1); -} - -namespace { -const uint8_t* GetLookupTableShared() { - // initialized once - static const auto* lookup_table = []() { - // if we have already initialized the lookup table, just return - // ideally we could have a global lookup table, but that account for too much space. - /* Handles values form -640 to 639. */ - static uint8_t table[1280] = {0}; - - // taken from https://github.com/python-pillow/Pillow/blob/66add095a50d76c35c7f58643461f2edf78a3f05/src/libImaging/Resample.c#L94 - // we need to handle negative values - // it's equivalent to :x = np.clip(x, 0, 255) where x \in [-640, 639] - // we will accept a negative x for (&table[640])[x] means table +640 -x - for (int i = 0; i < 1280; ++i) { - table[i] = static_cast(std::min(std::max(i - 640, 0), 255)); - } - return table; - }(); - return lookup_table; -} -} // namespace - template struct FilterParamsBaseAntiAlias { std::vector bound; @@ -57,15 +31,15 @@ struct FilterParamsBaseAntiAlias { template struct FilterParamsAntiAlias { - float support_size = 2.0f; - float cubic_coeff_a = -0.75f; + float support_size = antialias_constants::kSupportSize; + float cubic_coeff_a = antialias_constants::kCubicCoeffA; FilterParamsBaseAntiAlias dim_x; FilterParamsBaseAntiAlias dim_y; FilterParamsBaseAntiAlias dim_z; const uint8_t* GetClip8LookupTable() const { - return GetLookupTableShared(); + return UpsampleBase::GetLookupTableShared(); } virtual ~FilterParamsAntiAlias() = default; virtual float Filter(float x) const = 0; @@ -89,7 +63,7 @@ struct BilinearParamsAntiAlias : FilterParamsAntiAlias { template struct BiCubicParamsAntiAlias : FilterParamsAntiAlias { BiCubicParamsAntiAlias() { - this->support_size = 4.0f; + this->support_size = antialias_constants::kBiCubicSupportSize; } // taken from @@ -124,27 +98,6 @@ struct TriLinearParamsAntiAlias : FilterParamsAntiAlias { } }; -template -struct AccumulateType { - using type = int32_t; - using Dtype = T; -}; - -template <> -struct AccumulateType { - using type = float; -}; - -template <> -struct AccumulateType { - using type = float; -}; - -template <> -struct AccumulateType { - using type = double; -}; - // The following method supports a 3/4/5-D input in 'Linear mode, cubic mode' // that amounts to 'Bilinear,TriLinear, Bicubic/Tricubic' Upsampling/Resizing in the sense that it assumes // A N-D tensor has @@ -156,19 +109,20 @@ struct AccumulateType { // - [N, H, W, C] and the scales are [1.0, height_scale, width_scale, 1.0] template void 
SetupUpsampleFilterAntiAlias(FilterParamsAntiAlias& p, - const gsl::span input_h_w_c, - const gsl::span output_h_w_c, - const gsl::span scale_h_w_c, - const std::vector& roi, + gsl::span input_h_w_c, + gsl::span output_h_w_c, + gsl::span scale_h_w_c, + gsl::span roi, AllocatorPtr& alloc, const GetOriginalCoordinateFunc& get_original_coordinate, bool exclude_outside, const bool is_nchw) { - auto compute_weight_coefficients = [&alloc, &roi, &get_original_coordinate, exclude_outside](const FilterParamsAntiAlias& p, - const int64_t input_size, - const int64_t output_size, - size_t rindex, - FilterParamsBaseAntiAlias& param_base, - const float rscale) -> int64_t { + auto compute_weight_coefficients = [&alloc, roi, &get_original_coordinate, exclude_outside]( + const FilterParamsAntiAlias& p, + const int64_t input_size, + const int64_t output_size, + size_t rindex, + FilterParamsBaseAntiAlias& param_base, + const float rscale) -> int64_t { param_base.bound.reserve(static_cast(output_size) * 2); param_base.out_of_bound_idx.reserve(static_cast(output_size)); @@ -245,13 +199,14 @@ void SetupUpsampleFilterAntiAlias(FilterParamsAntiAlias& p, // normalize the scale to 1 << 22 for int8/uint8 if constexpr (std::is_same::value) { - scale_buffer_int[x] = static_cast(std::round(scale_buffer[x] * ConstValue::mag_factor * 2.f)); + scale_buffer_int[x] = static_cast(std::round(scale_buffer[x] * ConstValue::mag_factor_x_2)); } } /*for (; x < window_size; x++) { scale_buffer[x] = 0; }*/ } + return window_size; }; @@ -269,9 +224,6 @@ void SetupUpsampleFilterAntiAlias(FilterParamsAntiAlias& p, } } -template -inline constexpr bool is_8bit_v = std::is_same::value || std::is_same::value; - /** * @brief To compute interpolation along with the last axis. * For brief,we assume the input tensor has 3 dimensions and we all it CHW for each character represent a dim. @@ -398,6 +350,7 @@ void ComputeInterpolationAtLevel2(int64_t num_channels, int64_t input_height, in output += *Xdata_offset * (*weight_coeff_start++); Xdata_offset += output_width; } + if constexpr (is_8bit_v) { *Ydata_offset++ = static_cast(clip8_lookups[output >> 22]); } else if constexpr (std::is_same::value) { @@ -444,6 +397,7 @@ void ComputeInterpolationAtLevel2(int64_t num_channels, int64_t input_height, in output += *Xdata_offset * (*weight_coeff_start++); Xdata_offset += output_width; } + if constexpr (is_8bit_v) { *Ydata_offset++ = static_cast(clip8_lookups[output >> 22]); } else if constexpr (std::is_same::value) { @@ -515,6 +469,7 @@ void UpsampleBaseAntiAlias(FilterParamsAntiAlias& p, narrow(input_height * num_channels * input_width)); auto ydata_span = gsl::make_span(image_temp_buffer.get(), narrow(input_height * num_channels * output_width)); + // This computes only the width direction.Thus height keeps unchanged. 
ComputeInterpolationAtLevel1(num_channels, input_height, input_width, input_height, output_width, xdata_span, ydata_span, p, p.dim_x, tp); } @@ -546,7 +501,7 @@ void UpsampleBilinearAntiAlias(const int64_t batch_size, const int64_t output_width, const float height_scale, const float width_scale, - const std::vector& roi, + gsl::span roi, const bool use_extrapolation, const float extrapolation_value, bool exclude_outside, @@ -575,7 +530,7 @@ void NhwcUpsampleBilinearAntiAlias(const int64_t batch_size, const int64_t output_width, const float height_scale, const float width_scale, - const std::vector& roi, + gsl::span roi, const bool use_extrapolation, const float extrapolation_value, bool exclude_outside, @@ -608,7 +563,7 @@ void NhwcResizeBiCubicAntiAlias(const int64_t batch_size, bool use_extrapolation, float extrapolation_value, bool exclude_outside, - const std::vector& roi, + gsl::span roi, const Tensor* X, T* Ydata_base, AllocatorPtr& alloc, @@ -688,7 +643,7 @@ void ResizeBiCubicAntiAlias(int64_t batch_size, bool use_extrapolation, float extrapolation_value, bool exclude_outside, - const std::vector& roi, + gsl::span roi, const Tensor* X, T* Ydata_base, AllocatorPtr& alloc, @@ -719,7 +674,7 @@ void UpsampleTrilinearAntiAlias(int64_t batch_size, float depth_scale, float height_scale, float width_scale, - const std::vector& roi, + gsl::span roi, bool use_extrapolation, float extrapolation_value, bool exclude_outside, diff --git a/onnxruntime/core/providers/cpu/tensor/upsamplebase.h b/onnxruntime/core/providers/cpu/tensor/upsamplebase.h index a0e7ca1084fef..b768fedd8513a 100644 --- a/onnxruntime/core/providers/cpu/tensor/upsamplebase.h +++ b/onnxruntime/core/providers/cpu/tensor/upsamplebase.h @@ -3,11 +3,13 @@ #pragma once +#include #include #include #include #include -#include + +#include #include "core/common/status.h" #include #include @@ -58,7 +60,73 @@ enum class AspectRatioPolicy { NOT_SMALLER, }; +// Antialias types +template +struct AccumulateType { + using type = int32_t; + using Dtype = T; +}; + +template <> +struct AccumulateType { + using type = float; +}; + +template <> +struct AccumulateType { + using type = float; +}; + +template <> +struct AccumulateType { + using type = float; +}; + +template <> +struct AccumulateType { + using type = double; +}; + +namespace antialias_constants { +constexpr float kCubicCoeffA = -0.75f; +constexpr float kSupportSize = 2.0f; +constexpr float kBiCubicSupportSize = 4.0f; +} // namespace antialias_constants + +namespace ConstValue { +constexpr int32_t mag_factor = 1 << (22 - 1); +// We use to multiply by 2, let's make a constant which is twice as big +constexpr int32_t mag_factor_x_2 = 1 << 22; +} // namespace ConstValue + +template +inline constexpr bool is_8bit_v = std::is_same::value || std::is_same::value; + +template +void PrintAntiAliasBuffers(std::ostream& os, gsl::span bounds, gsl::span out_of_bounds, + gsl::span weight_coefficients) { + os << "#### Bounds: "; + std::copy(bounds.begin(), bounds.end(), std::ostream_iterator(os, " ")); + os << std::endl; + + os << "#### Out of Bounds: "; + std::copy(out_of_bounds.begin(), out_of_bounds.end(), + std::ostream_iterator(os, " ")); + os << std::endl; + + os << "#### Scale Buffer: "; + std::copy(weight_coefficients.begin(), weight_coefficients.end(), + std::ostream_iterator(os, " ")); + os << std::endl; +} + class UpsampleBase { + public: + // Make this available in other EP via provider bridge + // it works iff output_shape is specified + void AdjustOutputSizeAsPolicy(TensorShapeVector& 
output_dims, gsl::span input_dims, + InlinedVector& scales) const; + protected: explicit UpsampleBase(const OpKernelInfo& info) : scales_cached_(false), roi_cached_(false), use_extrapolation_(false) { @@ -69,23 +137,32 @@ class UpsampleBase { std::string mode; ORT_ENFORCE(info.GetAttr("mode", &mode).IsOK()); mode_ = StringToUpsampleMode(mode); - antialias_ = info.GetAttrOrDefault("antialias", 0) == 0 ? false : true; - if (antialias_) { - ORT_ENFORCE((UpsampleMode::LINEAR == mode_ || UpsampleMode::CUBIC == mode_), - "when anti-aliasing is set, Resize only supports mode `LINEAR` and `CUBIC`."); - } auto input_count = info.GetInputCount(); if (input_count == 1) { // opset < 10 - ORT_THROW_IF_ERROR(info.GetAttrs("scales", scales_)); - ORT_THROW_IF_ERROR(ScalesValidation(scales_, mode_)); + std::vector scales; + ORT_THROW_IF_ERROR(info.GetAttrs("scales", scales)); + ORT_THROW_IF_ERROR(ScalesValidation(scales, mode_)); + scales_.assign(scales.cbegin(), scales.cend()); scales_cached_ = true; } - std::string keep_aspect_ratio_policy = info.GetAttrOrDefault("keep_aspect_ratio_policy", "stretch"); - keep_aspect_ratio_policy_ = StringToKeepAspectRatioPolicy(keep_aspect_ratio_policy); + if (opset >= 18) { + antialias_ = info.GetAttrOrDefault("antialias", 0) == 0 ? false : true; + + if (antialias_) { + ORT_ENFORCE((UpsampleMode::LINEAR == mode_ || UpsampleMode::CUBIC == mode_), + "when anti-aliasing is set, Resize only supports mode `LINEAR` and `CUBIC`."); + } - axes_ = info.GetAttrsOrDefault("axes"); + // The attribute is absent in opset < 18, but the default value as if stretch. + std::string keep_aspect_ratio_policy = info.GetAttrOrDefault("keep_aspect_ratio_policy", "stretch"); + keep_aspect_ratio_policy_ = StringToKeepAspectRatioPolicy(keep_aspect_ratio_policy); + + // guard against unit tests that can add an attribute + auto axes = info.GetAttrsOrDefault("axes"); + axes_.assign(axes.cbegin(), axes.cend()); + } extrapolation_value_ = info.GetAttrOrDefault("extrapolation_value", 0.0f); @@ -112,7 +189,7 @@ class UpsampleBase { nearest_mode_ = StringToNearestMode(nearest_mode_name); get_nearest_pixel_ = GetNearestPixelFromOriginal(nearest_mode_); - cubic_coeff_a_ = info.GetAttrOrDefault("cubic_coeff_a", -0.75f); + cubic_coeff_a_ = info.GetAttrOrDefault("cubic_coeff_a", antialias_constants::kCubicCoeffA); exclude_outside_ = info.GetAttrOrDefault("exclude_outside", 0) == 0 ? 
false : true; if ((exclude_outside_ == 1 && mode_ != CUBIC) && (antialias_ == false || mode_ != LINEAR)) { @@ -166,7 +243,7 @@ class UpsampleBase { ResizeCoordinateTransformationMode coordinate_transform_mode_; GetOriginalCoordinateFunc get_original_coordinate_; ResizeNearestMode nearest_mode_; - AspectRatioPolicy keep_aspect_ratio_policy_; + AspectRatioPolicy keep_aspect_ratio_policy_{AspectRatioPolicy::STRETCH}; GetNearestPixelFunc get_nearest_pixel_; float cubic_coeff_a_; bool exclude_outside_; @@ -174,9 +251,9 @@ class UpsampleBase { float extrapolation_value_; bool use_nearest2x_optimization_ = false; - std::vector scales_; - std::vector roi_; - std::vector axes_; + InlinedVector scales_; + InlinedVector roi_; + TensorShapeVector axes_; bool scales_cached_; bool roi_cached_; @@ -335,7 +412,7 @@ class UpsampleBase { } } - [[nodiscard]] Status ScalesValidation(const std::vector& scales, const UpsampleMode mode) const { + [[nodiscard]] Status ScalesValidation(gsl::span scales, const UpsampleMode mode) const { if (!is_resize_) { for (auto& scale : scales) { ORT_RETURN_IF_NOT(scale >= 1, "Scale value should be greater than or equal to 1."); @@ -372,7 +449,7 @@ class UpsampleBase { } [[nodiscard]] Status - ParseScalesData(const Tensor* scale, std::vector& scales, int64_t rank) const { + ParseScalesData(const Tensor* scale, InlinedVector& scales, int64_t rank) const { const auto* scale_data = scale->Data(); int64_t scales_size = scale->Shape().Size(); ORT_RETURN_IF_NOT(scales_size > 0, "scales size should be greater than 0."); @@ -387,19 +464,19 @@ class UpsampleBase { // in which case the other axes is ignored and use default scale of 1 // scales_size == axes_.size() should be guaranteed if axes is not empty if (rank > 0 && (scales_size != rank || axes_.size())) { - std::vector new_scales(size_t(rank), 1.0f); + InlinedVector new_scales(size_t(rank), 1.0f); ORT_RETURN_IF_NOT(*std::max_element(axes_.begin(), axes_.end()) < rank && (int64_t(axes_.size()) == scales_size), "all values in axes should be less than rank of the data"); for (size_t i = 0; i < axes_.size(); i++) { new_scales[static_cast(axes_[i])] = scales[i]; } - scales = new_scales; + scales.swap(new_scales); } return ScalesValidation(scales, mode_); } - void ParseRoiData(const Tensor* roi, std::vector& roi_array) const { + void ParseRoiData(const Tensor* roi, InlinedVector& roi_array) const { int64_t roi_size = roi->Shape().Size(); if (roi_size > 0) { roi_array.resize(onnxruntime::narrow(roi_size)); @@ -429,52 +506,11 @@ class UpsampleBase { return Status::OK(); } - // it works iff output_shape is specified - void AdjustOutputSizeAsPolicy(TensorShapeVector& output_dims, gsl::span input_dims, - std::vector& scales) const { - std::unordered_set axes_set(axes_.begin(), axes_.end()); - - // AspectRatioPolicy::STRETCH is default policy when opset < 18 - if (keep_aspect_ratio_policy_ == AspectRatioPolicy ::STRETCH) { - return; - } - - float scale_in_policy = 0.0f; - if (keep_aspect_ratio_policy_ == AspectRatioPolicy ::NOT_LARGER) { - scale_in_policy = std::numeric_limits::max(); - - for (size_t i = 0; i < scales.size(); i++) { - if (axes_set.empty() || axes_set.count(i) > 0) { - scale_in_policy = std::min(scale_in_policy, scales[i]); - } - } - } else if (keep_aspect_ratio_policy_ == AspectRatioPolicy ::NOT_SMALLER) { - scale_in_policy = std::numeric_limits::min(); - - for (size_t i = 0; i < scales.size(); i++) { - if (axes_set.empty() || axes_set.count(i) > 0) { - scale_in_policy = std::max(scale_in_policy, scales[i]); - } - } - } - - 
for (size_t i = 0; i < scales.size(); i++) { - // if axes is not specified (AKA axes_set.empty()), we apply the policy to all axes - if (axes_set.empty() || axes_set.count(i) > 0) { - scales[i] = scale_in_policy; - output_dims[i] = static_cast(std::round(scales[i] * input_dims[i])); - } else { - scales[i] = 1.0f; - output_dims[i] = input_dims[i]; - } - } - } - // It's different in Opset 18 and before. // we will modify output_shape by sorts of policy even if it's specified [[nodiscard]] Status ParseScalesDataAndAdjustOutputSize(TensorShapeVector& output_dims, gsl::span input_dims, - std::vector& scales) const { + InlinedVector& scales) const { for (size_t i = 0, end = input_dims.size(); i < end; ++i) { // Handle corner case to avoid dividing by zero in the next step if (input_dims[i] == 0) { @@ -507,9 +543,9 @@ class UpsampleBase { // Roi is redefined in Opset-18, we have a concept of axes. // So we need to update it accordingly. - void ComputeROIWithAxes(std::vector& roi_array, size_t rank) const { + void ComputeROIWithAxes(InlinedVector& roi_array, size_t rank) const { if (axes_.size()) { - std::vector roi_tmp(rank * 2, 0); + InlinedVector roi_tmp(rank * 2, 0); for (size_t i = rank; i < rank * 2; ++i) { roi_tmp[i] = 1; } @@ -518,9 +554,32 @@ class UpsampleBase { roi_tmp[v_in_axes] = (roi_array[i]); roi_tmp[rank + v_in_axes] = (roi_array[axes_.size() + i]); } - roi_array = roi_tmp; + roi_array.swap(roi_tmp); } } + + public: + static constexpr size_t kLookupTableSize = 1280; + + static const uint8_t* GetLookupTableShared() { + // initialized once + static const auto* lookup_table = []() { + // if we have already initialized the lookup table, just return + // ideally we could have a global lookup table, but that account for too much space. + /* Handles values form -640 to 639. 
*/ + static uint8_t table[kLookupTableSize] = {0}; + + // taken from https://github.com/python-pillow/Pillow/blob/66add095a50d76c35c7f58643461f2edf78a3f05/src/libImaging/Resample.c#L94 + // we need to handle negative values + // it's equivalent to :x = np.clip(x, 0, 255) where x \in [-640, 639] + // we will accept a negative x for (&table[640])[x] means table +640 -x + for (int i = 0; i < static_cast(kLookupTableSize); ++i) { + table[i] = static_cast(std::min(std::max(i - 640, 0), 255)); + } + return table; + }(); + return lookup_table; + } }; // UpsampleBase } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/cu_inc/common.cuh b/onnxruntime/core/providers/cuda/cu_inc/common.cuh index 0d9928baa86e0..66794f88d8670 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/common.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/common.cuh @@ -194,13 +194,13 @@ template <> __device__ __inline__ half _Ceil(half a) { return half(ceilf((float)a)); } template -__device__ __inline__ T _Floor(T a); +__device__ __host__ __inline__ T _Floor(T a); template <> -__device__ __inline__ float _Floor(float a) { return floorf(a); } +__device__ __host__ __inline__ float _Floor(float a) { return floorf(a); } template <> -__device__ __inline__ double _Floor(double a) { return floor(a); } +__device__ __host__ __inline__ double _Floor(double a) { return floor(a); } template <> __device__ __inline__ half _Floor(half a) { return half(floorf((float)a)); } @@ -230,13 +230,13 @@ template <> __device__ __inline__ half _Erf(half a) { return half(erff((float)a)); } template -__device__ __inline__ T _Round(T a); +__device__ __host__ __inline__ T _Round(T a); template <> -__device__ __inline__ float _Round(float a) { return rintf(a); } +__device__ __host__ __inline__ float _Round(float a) { return rintf(a); } template <> -__device__ __inline__ double _Round(double a) { return rint(a); } +__device__ __host__ __inline__ double _Round(double a) { return rint(a); } template <> __device__ __inline__ half _Round(half a) { diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 00783bcbc2665..1ce089fd93044 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -1109,11 +1109,11 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, MLFloat16, ReduceSumSquare); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int64_t, GatherND); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, Dropout); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, Resize); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, Resize); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, Resize); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int32_t, Resize); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, uint8_t, Resize); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 17, float, Resize); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 17, double, Resize); +class 
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 17, MLFloat16, Resize); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 17, int32_t, Resize); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 17, uint8_t, Resize); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 18, If); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 18, Loop); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, Flatten); @@ -1277,6 +1277,11 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, double, Pad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, MLFloat16, Pad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, bool, Pad); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, float, Resize); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, double, Resize); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, MLFloat16, Resize); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, int32_t, Resize); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, uint8_t, Resize); // Opset 19 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, float, Cast); @@ -2009,11 +2014,11 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2176,6 +2181,11 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // Opset 19 BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cuda/tensor/resize.cc b/onnxruntime/core/providers/cuda/tensor/resize.cc index 764172a8d1fac..97d4eb71e970a 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize.cc +++ b/onnxruntime/core/providers/cuda/tensor/resize.cc @@ -28,10 +28,22 @@ namespace cuda { .InputMemoryType(OrtMemTypeCPUInput, 3) \ .TypeConstraint("T1", DataTypeImpl::GetTensorType()), \ Resize); \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + Resize, \ + kOnnxDomain, \ + 13, 17, \ + T, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .InputMemoryType(OrtMemTypeCPUInput, 1) \ + .InputMemoryType(OrtMemTypeCPUInput, 2) \ + .InputMemoryType(OrtMemTypeCPUInput, 3) \ + .TypeConstraint("T1", DataTypeImpl::GetTensorType()), \ + Resize); \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ Resize, \ kOnnxDomain, \ - 13, \ + 18, \ T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()) \ diff --git a/onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu new file mode 100644 index 
0000000000000..56b7c3f499303 --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu @@ -0,0 +1,1179 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/cuda/cu_inc/common.cuh" +#include "core/providers/cuda/tensor/resize_impl.h" + +#define FUNC_DEF __device__ + +namespace onnxruntime { +namespace cuda { + +using onnxruntime::ResizeCoordinateTransformationMode; +using onnxruntime::UpsampleMode; + +/// +/// Compute a buffer for bilinear data for CUDA antialias resizing. +/// +static std::tuple ComputeBilinearScaleBufferSize( + int64_t output_height, int64_t output_width, + float height_rscale, float width_rscale, + float support_value, + float& scaled_support_height, float& scaled_support_width, + int32_t& window_size_height, int32_t& window_size_width) { + scaled_support_height = ComputeScaledSupportValue(support_value, height_rscale); + scaled_support_width = ComputeScaledSupportValue(support_value, width_rscale); + window_size_height = ComputeWindowSize(scaled_support_height); + window_size_width = ComputeWindowSize(scaled_support_width); + + auto height_buffer_size = ComputeWeightedCoeffBufferSize(output_height, window_size_height); + auto width_buffer_size = ComputeWeightedCoeffBufferSize(output_width, window_size_width); + + return std::make_tuple(height_buffer_size, width_buffer_size); +} + +/// +/// Compute a buffer for btrilinear data for CUDA antialias resizing. +/// +static std::tuple ComputeTrilinearScaleBufferSize( + int64_t output_depth, int64_t output_height, int64_t output_width, + float depth_rscale, float height_rscale, float width_rscale, + float support_value, + float& scaled_support_depth, float& scaled_support_height, + float& scaled_support_width, int32_t& window_size_depth, + int32_t& window_size_height, int32_t& window_size_width) { + scaled_support_depth = ComputeScaledSupportValue(support_value, depth_rscale); + window_size_depth = ComputeWindowSize(scaled_support_depth); + auto depth_buffer_size = ComputeWeightedCoeffBufferSize(output_depth, window_size_depth); + + const auto [y_buffer_size, w_buffer_size] = ComputeBilinearScaleBufferSize(output_height, + output_width, height_rscale, + width_rscale, support_value, + scaled_support_height, + scaled_support_width, + window_size_height, window_size_width); + return std::make_tuple(depth_buffer_size, y_buffer_size, w_buffer_size); +} + +// Antialiasing filters +struct BilinearFilter { + __device__ __host__ float operator()(float x, float /* cubic_coeff_a */) const { + if (x < 0.0f) { + x = -x; + } + if (x < 1.0f) { + return 1.0f - x; + } + return 0.0f; + } +}; + +struct BiCubicFilter { + __device__ __host__ float operator()(float x, float cubic_coeff_a) const { + /* https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm + */ + if (x < 0.0f) { + x = -x; + } + if (x < 1.0f) { + return ((cubic_coeff_a + 2.0f) * x - (cubic_coeff_a + 3.0f)) * x * x + 1; + } + if (x < 2.0f) { + return (((x - 5.0f) * x + 8.f) * x - 4.f) * cubic_coeff_a; + } + return 0.0f; + } +}; + +struct TriLinearFilter { + __device__ __host__ float operator()(float x, float /* cubic_coeff_a */) const { + if (x < 0.0f) { + x = -x; + } + if (x < 1.0f) { + return 1.0f - x; + } + return 0.0f; + } +}; + +template +struct AccumTypeCaster { + static __device__ __host__ AccumType* cast(AccumType* p) { + return p; + } +}; + +template <> +struct AccumTypeCaster { + static __device__ __host__ float* cast(int32_t* p) { + return 
reinterpret_cast(p); + } +}; + +template +__global__ void _ComputeInterpolationAtLevel1( + int64_t num_channels, + int64_t input_height, int64_t input_width, + int64_t output_height, int64_t output_width, + const fast_divmod div_output_width, + const fast_divmod div_output_image, + int32_t window_size, + const uint8_t* clip8_table, + const int64_t* bound_data, + std::tuple outof_bounds_buffers, + const AccumType* weight_coefficients, + const T* Xdata, T* Ydata, + const int N) { + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + + // No need to do scale + if (output_width == input_width) { + Ydata[id] = Xdata[id]; + return; + } + + int bxc, output_image_index; + div_output_image.divmod(id, bxc, output_image_index); + + int output_y, output_x; + div_output_width.divmod(output_image_index, output_y, output_x); + + CUDA_LONG input_index = static_cast(bxc * num_channels * input_height * input_width); + CUDA_LONG output_index = static_cast(bxc * num_channels * output_height * output_width); + + auto* Ydata_offset = Ydata + output_index + output_width * output_y + output_x; + const auto* bound = bound_data; + + AccumType output = onnxruntime::is_8bit_v ? ConstValue::mag_factor : 0; + + const auto* weight_coeff = weight_coefficients + window_size * output_x; + int64_t xmin = bound[static_cast(output_x) * 2]; + int64_t xmax = bound[static_cast(output_x) * 2 + 1]; + + // Input window + const auto* Xdata_offset = Xdata + input_index + input_width * output_y + xmin; + + for (; xmin < xmax; ++xmin) { + if constexpr (std::is_same::value) { + // This cast is needed when we deal with half + output += static_cast((*Xdata_offset++)) * (*weight_coeff++); + } else { + output += (*Xdata_offset++) * (*weight_coeff++); + } + } + + if constexpr (onnxruntime::is_8bit_v) { + const uint8_t* clip8_lookups = &clip8_table[640]; + *Ydata_offset = static_cast(clip8_lookups[output >> 22]); + } else if constexpr (std::is_same::value) { + *Ydata_offset = static_cast(std::round(output)); + } else { + *Ydata_offset = static_cast(output); + } +} + +template +__global__ void _ComputeInterpolationAtLevel2( + int64_t num_channels, + int64_t input_height, int64_t input_width, + int64_t output_height, int64_t output_width, + const fast_divmod div_output_height, + const fast_divmod div_output_width, + const fast_divmod div_output_image, + int32_t window_size, + bool use_extrapolation, float extrapolation_value, + const uint8_t* clip8_table, + const int64_t* bound_data, + std::tuple outof_bounds_buffers, + const AccumType* weight_coefficients, + const T* Xdata, T* Ydata, int N) { + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + + // No need to do scale + if (output_height == input_height) { + Ydata[id] = Xdata[id]; + return; + } + + int bxc, output_image_index; + div_output_image.divmod(id, bxc, output_image_index); + + int output_z, output_y, output_x, temp; + div_output_height.divmod(output_image_index, output_z, temp); + div_output_width.divmod(temp, output_y, output_x); + + CUDA_LONG input_index = static_cast(bxc * num_channels * input_height * input_width + + output_z * input_height * input_width); + CUDA_LONG output_index = static_cast(bxc * num_channels * output_height * output_width + + output_z * output_height * output_width); + + auto* Ydata_offset = Ydata + output_index + output_width * output_y + output_x; + + if (use_extrapolation) { + const auto* w_outof_bounds = std::get<1>(outof_bounds_buffers); + // Extrapolate along the w dimension + if (w_outof_bounds[static_cast(output_x)] != -1) { + *Ydata_offset = 
static_cast(extrapolation_value); + return; + } + + // Extrapolate along the y dimension + const auto* y_outof_bounds = std::get<0>(outof_bounds_buffers); + if (y_outof_bounds[static_cast(output_y)] != -1) { + *Ydata_offset = static_cast(extrapolation_value); + return; + } + } + + const auto* bound = bound_data; + + AccumType output = onnxruntime::is_8bit_v ? ConstValue::mag_factor : 0; + + const auto* weight_coeff = weight_coefficients + window_size * output_y; + int64_t ymin = bound[static_cast(output_y) * 2]; + int64_t ymax = bound[static_cast(output_y) * 2 + 1]; + + const auto* Xdata_offset = Xdata + input_index + ymin * output_width + output_x; + + for (; ymin < ymax; ++ymin) { + if constexpr (std::is_same::value) { + // We cast to AccumType to resolve ambiguous call to operator* for half in CUDA + output += static_cast((*Xdata_offset)) * (*weight_coeff++); + } else { + output += (*Xdata_offset) * (*weight_coeff++); + } + Xdata_offset += input_width; + } + + if constexpr (onnxruntime::is_8bit_v) { + const uint8_t* clip8_lookups = &clip8_table[640]; + *Ydata_offset = static_cast(clip8_lookups[output >> 22]); + } else if constexpr (std::is_same::value) { + *Ydata_offset = static_cast(std::round(output)); + } else { + *Ydata_offset = output; + } +} + +template +__global__ void _ComputeInterpolationAtLevel3( + int64_t input_depth, + int64_t input_height, int64_t input_width, + int64_t output_depth, + int64_t output_height, int64_t output_width, + const fast_divmod div_output_height, + const fast_divmod div_output_width, + const fast_divmod div_output_image, + int32_t window_size, + bool use_extrapolation, float extrapolation_value, + const uint8_t* clip8_table, + const int64_t* bound_data, + std::tuple outof_bounds_buffers, + const AccumType* weight_coefficients, + const T* Xdata, T* Ydata, int N) { + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + + // No need to do scale + if (input_depth == output_depth) { + Ydata[id] = Xdata[id]; + return; + } + + int bxc, output_image_index; + div_output_image.divmod(id, bxc, output_image_index); + + int output_z, output_y, output_x, temp; + div_output_height.divmod(output_image_index, output_z, temp); + div_output_width.divmod(temp, output_y, output_x); + + CUDA_LONG input_index = static_cast(bxc * input_depth * input_height * input_width); + + auto* Ydata_offset = Ydata + id; + + if (use_extrapolation) { + const auto* w_outof_bounds = std::get<2>(outof_bounds_buffers); + // Extrapolate along the w dimension + if (w_outof_bounds[static_cast(output_x)] != -1) { + *Ydata_offset = static_cast(extrapolation_value); + return; + } + + // Extrapolate along the y dimension + const auto* y_outof_bounds = std::get<1>(outof_bounds_buffers); + if (y_outof_bounds[static_cast(output_y)] != -1) { + *Ydata_offset = static_cast(extrapolation_value); + return; + } + + // Extrapolate along the z dimension + const int64_t* z_outof_bounds = std::get<0>(outof_bounds_buffers); + if (z_outof_bounds != nullptr && z_outof_bounds[static_cast(output_z)] != -1) { + *Ydata_offset = static_cast(extrapolation_value); + return; + } + } + + const auto* bound = bound_data; + + AccumType output = onnxruntime::is_8bit_v ? 
ConstValue::mag_factor : 0; + + const auto* weight_coeff = weight_coefficients + window_size * output_z; + int64_t zmin = bound[static_cast(output_z) * 2]; + int64_t zmax = bound[static_cast(output_z) * 2 + 1]; + + const auto z_step = input_height * input_width; + const auto* Xdata_offset = Xdata + input_index + zmin * z_step + output_y * output_width + output_x; + + for (; zmin < zmax; ++zmin) { + if constexpr (std::is_same::value) { + // We cast to AccumType to resolve ambiguous call to operator* for half in CUDA + output += static_cast((*Xdata_offset)) * (*weight_coeff++); + } else { + output += (*Xdata_offset) * (*weight_coeff++); + } + Xdata_offset += z_step; + } + + if constexpr (onnxruntime::is_8bit_v) { + const uint8_t* clip8_lookups = &clip8_table[640]; + *Ydata_offset = static_cast(clip8_lookups[output >> 22]); + } else if constexpr (std::is_same::value) { + *Ydata_offset = static_cast(std::round(output)); + } else { + *Ydata_offset = output; + } +} + +/// +/// This function expects the following buffers to be pre-allocated on device +/// 1. bounds: int64_t[output_size * 2] +/// 2. out_of_bounds: int64_t[output_size] +/// 3. scale_data: T[output_size * window_size] +/// +/// Template parameter AccumType +/// +template +FUNC_DEF void SetupUpsampleFilterAnitAliasImpl( + int64_t i, + int64_t input_size, int64_t output_size, + float rscale, + float roi_start, float roi_end, + float scaled_support, int32_t window_size, bool exclude_outside, + float cubic_coeff_a, + int64_t* bounds, + int64_t* out_of_bounds, + AccumType* scale_data) { + Filter filter{}; + CudaFunctionOriginalCoordinate get_original_coordinate{}; + + const auto scale = 1.f / rscale; + const float inv_scale = (scale >= 1.0f) ? 1.0f / scale : 1.0f; + + const float id = static_cast(i); + float center = 0.5f; + if (scale == 1.0f) { + center += id; + } else { + center += get_original_coordinate(id, rscale, + static_cast(output_size), + static_cast(input_size), + roi_start, roi_end); + } + + if (center - 0.5f < 0 || center - 0.5f > static_cast(input_size - 1)) { + out_of_bounds[i] = i; + } else { + out_of_bounds[i] = -1; + } + + float total_weight{0}; + + auto fmin = _Floor(center - scaled_support + 0.5f); + auto fmax = _Floor(center + scaled_support + 0.5f); + + int64_t min_real = static_cast(fmin); + int64_t max_real = static_cast(fmax); + int64_t min_cut = std::max(min_real, 0); + int64_t max_cut = std::min(max_real, input_size); + + int64_t min_val = exclude_outside ? min_cut : min_real; + int64_t max_val = exclude_outside ? max_cut : max_real; + bounds[i * 2] = min_cut; + bounds[i * 2 + 1] = max_cut; + + // This is done for int32_t case, when the final result is in int32_t, but + // we perform calculations in float. All other types as is. + auto* scale_buffer = AccumTypeCaster::cast(&scale_data[i * window_size]); + + max_val -= min_val; + for (int64_t x = 0; x < max_val; x++) { + const float arg = (x + min_val - center + 0.5f) * inv_scale; + const auto w = filter(arg, cubic_coeff_a); + scale_buffer[x] = w; + total_weight += w; + } + + if (!exclude_outside) { + int64_t neg_xsize = min_val < 0 ? -min_val : 0; + for (int64_t x = 0; x < neg_xsize; x++) { + scale_buffer[neg_xsize] += scale_buffer[x]; + } + + int64_t bound_size = + max_val + min_val > input_size ? 
max_val + min_val - input_size : 0; + for (int64_t x = max_val - bound_size; x < max_val; x++) { + scale_buffer[max_val - bound_size - 1] += + scale_buffer[x]; + } + + for (int64_t x = 0; (neg_xsize | bound_size) > 0 && x < max_cut - min_cut; x++) { + scale_buffer[x] = scale_buffer[x + neg_xsize]; + } + } + + const float total_weight_inv = (total_weight == 0) ? 1.f : (1.f / total_weight); + if constexpr (std::is_same::value) { + auto* scale_buffer_int = reinterpret_cast(scale_buffer); + for (int64_t x = 0; x < max_cut - min_cut; x++) { + scale_buffer[x] *= total_weight_inv; + // normalize the scale to 1 << 22 for int8/uint8 + scale_buffer_int[x] = static_cast(_Round(scale_buffer[x] * ConstValue::mag_factor_x_2)); + } + } else { + for (int64_t x = 0; x < max_cut - min_cut; x++) { + scale_buffer[x] *= total_weight_inv; + } + } +} + +/// This kernel computes antialias filter for bilinear or bicubic upsampling. +/// The function expects the following buffers to be pre-allocated on device +/// 1. bounds: int64_t[output_size * 2] for each of the two dimensions +/// 2. out_of_bounds: int64_t[output_size] for each of the two dimensions +/// 3. scale_data: AccumType[output_size * window_size] for each of the two dimensions +/// Buffers layout [h_data, w_data] +template +__global__ void _SetupBilinearUpsampleFilterAntiAlias( + std::tuple input_dims, // h, w + std::tuple output_dims, // h, w + std::tuple inv_scale_vals, // h, w + std::tuple roi_start_vals, // h, w + std::tuple roi_end_vals, // h, w + std::tuple dim_scaled_support, // Pre-computed scaled support values h, w + std::tuple dim_window_size, // Pre-computed windows sizes h, w + float cubic_coeff_a, + bool exclude_outside, + int64_t* bounds, + int64_t* out_of_bounds, + std::tuple weighted_coefficients // y, h buffers +) { + const auto N = std::get<0>(output_dims) + std::get<1>(output_dims); + + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + + if (id < std::get<0>(output_dims)) { + // Setup for y + int64_t input_size = std::get<0>(input_dims); + int64_t output_size = std::get<0>(output_dims); + float inv_scale = std::get<0>(inv_scale_vals); + float roi_start = std::get<0>(roi_start_vals); + float roi_end = std::get<0>(roi_end_vals); + float scaled_support = std::get<0>(dim_scaled_support); + int32_t window_size = std::get<0>(dim_window_size); + + SetupUpsampleFilterAnitAliasImpl( + id, + input_size, output_size, + inv_scale, + roi_start, roi_end, + scaled_support, window_size, + exclude_outside, + cubic_coeff_a, + bounds, + out_of_bounds, + std::get<0>(weighted_coefficients)); + + } else { + // Setup for w + // w = id - output_height + + int64_t input_size = std::get<1>(input_dims); + int64_t output_size = std::get<1>(output_dims); + float inv_scale = std::get<1>(inv_scale_vals); + float roi_start = std::get<1>(roi_start_vals); + float roi_end = std::get<1>(roi_end_vals); + + float scaled_support = std::get<1>(dim_scaled_support); + int32_t window_size = std::get<1>(dim_window_size); + + // Adjust buffer positions + const auto y_output_size = std::get<0>(output_dims); + + auto i = id - y_output_size; + bounds += (y_output_size * 2); + out_of_bounds += y_output_size; + + SetupUpsampleFilterAnitAliasImpl( + i, + input_size, output_size, + inv_scale, + roi_start, roi_end, + scaled_support, window_size, + exclude_outside, + cubic_coeff_a, + bounds, + out_of_bounds, + std::get<1>(weighted_coefficients)); + } +} + +/// +/// Compute AntiAlias filter for trilinear upsampling, all in one go +/// The function expects the following buffers to be 
pre-allocated on device +/// 1. bounds: int64_t[output_size * 2] for each of the three dimensions +/// 2. out_of_bounds: int64_t[output_size] for each of the three dimensions +/// 3. scale_data: AccumType[output_size * window_size] for each of the three dimensions +/// Each kind of buffer contains data for all 3 dims. +/// Buffers layout [d_data, h_data, w_data] +/// +template +__global__ void _SetupTrilinerarUpsampleFilterAntiAlias( + std::tuple input_dims, // d, h, w + std::tuple output_dims, // d, h, w + std::tuple inv_scale_vals, // d, h, w + std::tuple roi_start_vals, // d, h, w + std::tuple roi_end_vals, // d, h, w + std::tuple dim_scaled_support, // Pre-computed scaled support values d, h, w + std::tuple dim_window_size, // Pre-computed windows sizes d, h, w + bool exclude_outisde, + int64_t* bounds, + int64_t* out_of_bounds, + std::tuple weighted_coefficients) { + const auto N = std::get<0>(output_dims) + std::get<1>(output_dims) + std::get<2>(output_dims); + + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + + if (id < std::get<0>(output_dims)) { + // Setup for d by default (id < output_depth) + int64_t input_size = std::get<0>(input_dims); + int64_t output_size = std::get<0>(output_dims); + float inv_scale = std::get<0>(inv_scale_vals); + float roi_start = std::get<0>(roi_start_vals); + float roi_end = std::get<0>(roi_end_vals); + float scaled_support = std::get<0>(dim_scaled_support); + int32_t window_size = std::get<0>(dim_window_size); + + SetupUpsampleFilterAnitAliasImpl( + id, + input_size, output_size, + inv_scale, + roi_start, roi_end, + scaled_support, window_size, + exclude_outisde, + onnxruntime::antialias_constants::kCubicCoeffA, // Default value for trilinear + bounds, + out_of_bounds, + std::get<0>(weighted_coefficients)); + + } else if (id >= std::get<0>(output_dims) && id < (std::get<0>(output_dims) + std::get<1>(output_dims))) { + int64_t input_size = std::get<1>(input_dims); + int64_t output_size = std::get<1>(output_dims); + float inv_scale = std::get<1>(inv_scale_vals); + float roi_start = std::get<1>(roi_start_vals); + float roi_end = std::get<1>(roi_end_vals); + + float scaled_support = std::get<1>(dim_scaled_support); + int32_t window_size = std::get<1>(dim_window_size); + + // Adjust buffer positions + const auto d_output_size = std::get<0>(output_dims); + + auto i = id - d_output_size; + bounds += d_output_size * 2; + out_of_bounds += d_output_size; + + SetupUpsampleFilterAnitAliasImpl( + i, + input_size, output_size, + inv_scale, + roi_start, roi_end, + scaled_support, window_size, + exclude_outisde, + onnxruntime::antialias_constants::kCubicCoeffA, // Default value for trilinear + bounds, + out_of_bounds, + std::get<1>(weighted_coefficients)); + } else { + int64_t input_size = std::get<2>(input_dims); + int64_t output_size = std::get<2>(output_dims); + float inv_scale = std::get<2>(inv_scale_vals); + float roi_start = std::get<2>(roi_start_vals); + float roi_end = std::get<2>(roi_end_vals); + float scaled_support = std::get<2>(dim_scaled_support); + int32_t window_size = std::get<2>(dim_window_size); + + // Adjust buffer positions + const auto d_y_output_size = std::get<0>(output_dims) + std::get<1>(output_dims); + + auto i = id - d_y_output_size; + bounds += (d_y_output_size * 2); + out_of_bounds += d_y_output_size; + + SetupUpsampleFilterAnitAliasImpl( + i, + input_size, output_size, + inv_scale, + roi_start, roi_end, + scaled_support, window_size, + exclude_outisde, + onnxruntime::antialias_constants::kCubicCoeffA, // Default value for trilinear + 
bounds, + out_of_bounds, + std::get<2>(weighted_coefficients)); + } +} + +#define CASEA_COORD_ANTIALIAS(coordinate_mode, TransformCoordType, ...) \ + case coordinate_mode: { \ + using coord_t = TransformCoordType; \ + return __VA_ARGS__(); \ + break; \ + } + +#define DISPATCH_ANTIALIAS_FILTER_SETUP(coord_enum, ...) \ + [&] { \ + const auto the_type = coord_enum; \ + switch (the_type) { \ + CASEA_COORD_ANTIALIAS(ResizeCoordinateTransformationMode::HALF_PIXEL, \ + TransformCoordinate_HALF_PIXEL, __VA_ARGS__) \ + CASEA_COORD_ANTIALIAS(ResizeCoordinateTransformationMode::ASYMMETRIC, \ + TransformCoordinate_ASYMMETRIC, __VA_ARGS__) \ + CASEA_COORD_ANTIALIAS(ResizeCoordinateTransformationMode::PYTORCH_HALF_PIXEL, \ + TransformCoordinate_PYTORCH_HALF_PIXEL, __VA_ARGS__) \ + CASEA_COORD_ANTIALIAS(ResizeCoordinateTransformationMode::ALIGN_CORNERS, \ + TransformCoordinate_ALIGN_CORNERS, __VA_ARGS__) \ + CASEA_COORD_ANTIALIAS(ResizeCoordinateTransformationMode::TF_HALF_PIXEL_FOR_NN, \ + TransformCoordinate_TF_HALF_PIXEL_FOR_NN, __VA_ARGS__) \ + CASEA_COORD_ANTIALIAS(ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE, \ + TransformCoordinate_TF_CROP_AND_RESIZE, __VA_ARGS__) \ + default: \ + ORT_THROW("unknown ResizeCoordinateTransformationMode"); \ + } \ + }() + +namespace { +template +IAllocatorUniquePtr AllocateTyped( + const TempSpaceAllocateFunc& alloc, + size_t elements) { + return alloc(elements * sizeof(T)); +} + +template +T* GetTyped(IAllocatorUniquePtr& bytes) { + return reinterpret_cast(bytes.get()); +} +} // namespace + +template +void ResizeTrilinearUpsample( + cudaStream_t stream, + int rank, + const UpsampleMode upsample_mode, + ResizeCoordinateTransformationMode coordinate_transform_mode, + gsl::span input_shape, + gsl::span output_shape, + int64_t batch_size, int64_t num_channels, + std::tuple inferred_input_dims, + std::tuple inferred_output_dims, + std::tuple inferred_dim_rscales, + const TArray& output_div_pitches, + gsl::span roi_vals, + const std::optional& extrapolation, + bool exclude_outside, + const TempSpaceAllocateFunc& allocate_temp_space, + const uint8_t* clip8_lookups, + const T* input_data, + T* output_data, + const size_t N) { + using AccumType = typename onnxruntime::AccumulateType::type; + + const bool use_extrapolation = extrapolation.has_value(); + const float extrapolation_value = use_extrapolation ? 
*extrapolation : 0.f; + + int64_t input_depth, input_height, input_width; + std::tie(input_depth, input_height, input_width) = inferred_input_dims; + + int64_t output_depth, output_height, output_width; + std::tie(output_depth, output_height, output_width) = inferred_output_dims; + + int blocksPerDimsMappingGrid = + static_cast(ceil((output_depth + output_height + output_width) / 32.0)); + + int blocksPerGrid = static_cast(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); + + constexpr float support_value = antialias_constants::kSupportSize; + float z_scale, h_scale, w_scale; + std::tie(z_scale, h_scale, w_scale) = inferred_dim_rscales; + + const auto& div_output_width = output_div_pitches[rank - 2]; + + SafeInt bounds_buffer_size = (SafeInt(output_depth) + output_height + output_width) * 2; + SafeInt out_of_bounds_buffer_size = (SafeInt(output_depth) + output_height + output_width); + + auto bounds_buffer_ptr = AllocateTyped(allocate_temp_space, bounds_buffer_size); + auto out_of_bounds_buffer_ptr = AllocateTyped(allocate_temp_space, out_of_bounds_buffer_size); + + int64_t* z_bounds_buffer = GetTyped(bounds_buffer_ptr); + int64_t* y_bounds_buffer = z_bounds_buffer + output_depth * 2; + int64_t* w_bounds_buffer = y_bounds_buffer + output_height * 2; + + int64_t* z_outof_bounds_buffer = GetTyped(out_of_bounds_buffer_ptr); + int64_t* y_outof_bounds_buffer = z_outof_bounds_buffer + output_depth; + int64_t* w_outof_bounds_buffer = y_outof_bounds_buffer + output_height; + + float z_scaled_support, h_scaled_support, w_scaled_support; + int32_t z_window_size, h_window_size, w_window_size; + const auto [z_buffer_size, y_buffer_size, w_buffer_size] = ComputeTrilinearScaleBufferSize( + output_depth, output_height, output_width, + z_scale, h_scale, w_scale, support_value, + z_scaled_support, h_scaled_support, w_scaled_support, + z_window_size, h_window_size, w_window_size); + + const int64_t weighted_buffer_size = SafeInt(z_buffer_size) + y_buffer_size + w_buffer_size; + + auto weighted_buffer_ptr = AllocateTyped(allocate_temp_space, weighted_buffer_size); + AccumType* z_weighted_buffer = GetTyped(weighted_buffer_ptr); + AccumType* y_weighted_buffer = z_weighted_buffer + z_buffer_size; + AccumType* w_weighted_buffer = y_weighted_buffer + y_buffer_size; + + const auto h_w_interpolate_temp_buf_size = SafeInt(batch_size) * num_channels * + input_depth * input_height * output_width; + auto h_w_interpolate_temp_buffer_ptr = AllocateTyped(allocate_temp_space, + narrow(h_w_interpolate_temp_buf_size)); + + const auto h_w_interpolate_result_buffer_size = SafeInt(batch_size) * num_channels * + input_depth * output_height * output_width; + auto h_w_interpolate_result_buffer_ptr = AllocateTyped(allocate_temp_space, h_w_interpolate_result_buffer_size); + + // clang-format off + DISPATCH_ANTIALIAS_FILTER_SETUP(coordinate_transform_mode, [&]() { + _SetupTrilinerarUpsampleFilterAntiAlias<<>>( + inferred_input_dims, + inferred_output_dims, + inferred_dim_rscales, + std::make_tuple(roi_vals[rank - 3], roi_vals[rank - 2], roi_vals[rank - 1]), // roi starts d, h, w + std::make_tuple(roi_vals[rank - 3 + rank], roi_vals[rank - 2 + rank], // roi ends d, h, w + roi_vals[rank - 1 + rank]), + std::make_tuple(z_scaled_support, h_scaled_support, w_scaled_support), + std::make_tuple(z_window_size, h_window_size, w_window_size), + exclude_outside, + GetTyped(bounds_buffer_ptr), + GetTyped(out_of_bounds_buffer_ptr), + std::make_tuple(z_weighted_buffer, y_weighted_buffer, w_weighted_buffer)); + }); + + // clang-format on + const 
fast_divmod div_w_image(narrow(num_channels * input_depth * input_height * output_width)); + // clang-format off + _ComputeInterpolationAtLevel1<<>>( + num_channels * input_depth, input_height, input_width, input_height, output_width, + div_output_width, + div_w_image, + w_window_size, + clip8_lookups, + w_bounds_buffer, + std::make_tuple(y_outof_bounds_buffer, w_outof_bounds_buffer), + w_weighted_buffer, input_data, + GetTyped(h_w_interpolate_temp_buffer_ptr), + narrow(h_w_interpolate_temp_buf_size)); + + // clang-format on + const fast_divmod div_output_height{narrow(output_height * output_width)}; + const fast_divmod div_h_w_image(narrow(num_channels * input_depth * output_height * output_width)); + // clang-format off + _ComputeInterpolationAtLevel2<<>>( + num_channels * input_depth, input_height, output_width, output_height, output_width, + div_output_height, + div_output_width, + div_h_w_image, + h_window_size, + false, 0.f, // No extrapolation + clip8_lookups, + y_bounds_buffer, + std::make_tuple(y_outof_bounds_buffer, w_outof_bounds_buffer), + y_weighted_buffer, GetTyped(h_w_interpolate_temp_buffer_ptr), + GetTyped(h_w_interpolate_result_buffer_ptr), + narrow(h_w_interpolate_result_buffer_size)); + + // clang-format on + const fast_divmod div_z_h_w_image(narrow(input_depth * output_height * output_width)); + // clang-format off + _ComputeInterpolationAtLevel3<<>>( + input_depth, output_height, output_width, + output_depth, output_height, output_width, + div_output_height, + div_output_width, + div_z_h_w_image, + z_window_size, + use_extrapolation, extrapolation_value, + clip8_lookups, + z_bounds_buffer, + std::make_tuple(z_outof_bounds_buffer, y_outof_bounds_buffer, w_outof_bounds_buffer), + z_weighted_buffer, GetTyped(h_w_interpolate_result_buffer_ptr), + output_data, + narrow(N)); + // clang-format on +} + +template +void ResizeBiLinearUpsample(cudaStream_t stream, + int rank, + const UpsampleMode upsample_mode, + ResizeCoordinateTransformationMode coordinate_transform_mode, + gsl::span input_shape, + gsl::span output_shape, + int64_t batch_size, int64_t num_channels, + std::tuple inferred_input_dims, + std::tuple inferred_output_dims, + std::tuple inferred_dim_rscales, + const TArray& output_div_pitches, + gsl::span roi_vals, + const std::optional& extrapolation, + bool exclude_outside, + const TempSpaceAllocateFunc& allocate_temp_space, + const uint8_t* clip8_lookups, + const T* input_data, + T* output_data, + const size_t N) { + using AccumType = typename onnxruntime::AccumulateType::type; + + const bool use_extrapolation = extrapolation.has_value(); + const float extrapolation_value = use_extrapolation ? *extrapolation : 0.f; + + int64_t input_depth, input_height, input_width; + std::tie(input_depth, input_height, input_width) = inferred_input_dims; + + int64_t output_depth, output_height, output_width; + std::tie(output_depth, output_height, output_width) = inferred_output_dims; + + int blocksPerDimsMappingGrid = + narrow(CeilDiv((output_depth + output_height + output_width), 32)); + + // rank 2 or 4 + const fast_divmod div_output_image = (rank > 2) ? 
output_div_pitches[rank - 4] + : fast_divmod(gsl::narrow_cast(N)); + const fast_divmod& div_output_width = output_div_pitches[rank - 2]; + + constexpr float support_value = antialias_constants::kSupportSize; + + float h_scale, w_scale; + std::tie(std::ignore, h_scale, w_scale) = inferred_dim_rscales; + + int blocksPerGrid = narrow(CeilDiv(N, GridDim::maxThreadsPerBlock)); + + SafeInt bounds_buffer_size = (SafeInt(output_height) + output_width) * 2; + SafeInt out_of_bounds_buffer_size = (SafeInt(output_height) + output_width); + + float h_scaled_support, w_scaled_support; + int32_t h_window_size, w_window_size; + const auto [weighted_y_size, weighted_w_size] = + ComputeBilinearScaleBufferSize(output_height, output_width, + h_scale, w_scale, support_value, + h_scaled_support, w_scaled_support, h_window_size, w_window_size); + + auto bounds_buffer_ptr = AllocateTyped(allocate_temp_space, bounds_buffer_size); + auto out_of_bounds_buffer_ptr = AllocateTyped(allocate_temp_space, out_of_bounds_buffer_size); + + int64_t* y_bounds_buffer = GetTyped(bounds_buffer_ptr); + int64_t* w_bounds_buffer = y_bounds_buffer + output_height * 2; + + int64_t* y_outof_bounds_buffer = GetTyped(out_of_bounds_buffer_ptr); + int64_t* w_outof_bounds_buffer = y_outof_bounds_buffer + output_height; + + const int64_t weighted_buffer_size = SafeInt(weighted_y_size) + weighted_w_size; + auto weighted_buffer_ptr = AllocateTyped(allocate_temp_space, narrow(weighted_buffer_size)); + + AccumType* y_weighted_buffer = GetTyped(weighted_buffer_ptr); + AccumType* w_weighted_buffer = y_weighted_buffer + weighted_y_size; + + const auto temp_buf_size = num_channels * input_height * output_width; + auto image_temp_buffer = AllocateTyped(allocate_temp_space, narrow(temp_buf_size)); + + // clang-format off + DISPATCH_ANTIALIAS_FILTER_SETUP(coordinate_transform_mode, [&]() { + // Data is d, h, w in tuples + + _SetupBilinearUpsampleFilterAntiAlias<<>>( + std::make_tuple(input_height, input_width), + std::make_tuple(output_height, output_width), + std::make_tuple(h_scale, w_scale), + std::make_tuple(roi_vals[rank - 2], roi_vals[rank - 1]), // roi starts h, w + std::make_tuple(roi_vals[rank - 2 + rank], roi_vals[rank - 1 + rank]), // roi ends h, w + std::make_tuple(h_scaled_support, w_scaled_support), + std::make_tuple(h_window_size, w_window_size), + onnxruntime::antialias_constants::kCubicCoeffA, exclude_outside, + GetTyped(bounds_buffer_ptr), + GetTyped(out_of_bounds_buffer_ptr), + std::make_tuple(y_weighted_buffer, w_weighted_buffer)); + }); + + // clang-format on + const fast_divmod div_step_image{narrow(num_channels * input_height * output_width)}; + // clang-format off + _ComputeInterpolationAtLevel1<<>>( + num_channels, input_height, input_width, input_height, output_width, + div_output_width, + div_step_image, + w_window_size, + clip8_lookups, + w_bounds_buffer, + std::make_tuple(y_outof_bounds_buffer, w_outof_bounds_buffer), + w_weighted_buffer, input_data, GetTyped(image_temp_buffer), + narrow(temp_buf_size)); + + // clang-format on + const fast_divmod div_output_height{narrow(output_height * output_width)}; + // clang-format off + _ComputeInterpolationAtLevel2<<>>( + num_channels, input_height, output_width, output_height, output_width, + div_output_height, + div_output_width, + div_output_image, + h_window_size, + use_extrapolation, extrapolation_value, + clip8_lookups, + y_bounds_buffer, + std::make_tuple(y_outof_bounds_buffer, w_outof_bounds_buffer), + y_weighted_buffer, GetTyped(image_temp_buffer), output_data, + narrow(N)); 
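The bilinear path above is structured as two separable 1-D passes: `_ComputeInterpolationAtLevel1` resamples along the width into a temporary buffer, then `_ComputeInterpolationAtLevel2` resamples that buffer along the height into the output. A tiny CPU-side sketch of the same separable idea follows; it is illustrative only (the window and weight values are hand-picked for a 2x2 -> 1x1 average), not the ONNX Runtime code:

```cpp
// CPU sketch of separable two-pass resampling with precomputed windows.
#include <cstdio>
#include <vector>

struct Window { int begin, end; std::vector<float> w; };

// One 1-D pass: for every output index, accumulate input[window] * weights.
static void Pass1D(const float* src, float* dst, int stride_in, int stride_out,
                   const std::vector<Window>& win) {
  for (size_t o = 0; o < win.size(); ++o) {
    float acc = 0.0f;
    for (int i = win[o].begin, k = 0; i < win[o].end; ++i, ++k)
      acc += src[i * stride_in] * win[o].w[k];
    dst[o * stride_out] = acc;
  }
}

int main() {
  // 2x2 -> 1x1 average, expressed as a width pass followed by a height pass.
  const int H = 2, W = 2, OH = 1, OW = 1;
  std::vector<float> x = {1, 2, 3, 4}, tmp(H * OW), y(OH * OW);
  std::vector<Window> wx = {{0, 2, {0.5f, 0.5f}}};  // window for the one output column
  std::vector<Window> wy = {{0, 2, {0.5f, 0.5f}}};  // window for the one output row
  for (int r = 0; r < H; ++r)   // level 1: resample each row along the width
    Pass1D(&x[r * W], &tmp[r * OW], 1, 1, wx);
  for (int c = 0; c < OW; ++c)  // level 2: resample each column along the height
    Pass1D(&tmp[c], &y[c], OW, OW, wy);
  std::printf("%.2f\n", y[0]);  // prints 2.50, the mean of {1, 2, 3, 4}
  return 0;
}
```

In the CUDA kernels the flat thread id plays the role of the loop indices here, decomposed into (batch*channel, y, x) with `fast_divmod`.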
+ + // clang-format on +} + +template +void ResizeBicubicUpsample(cudaStream_t stream, + int rank, + const UpsampleMode upsample_mode, + ResizeCoordinateTransformationMode coordinate_transform_mode, + gsl::span input_shape, + gsl::span output_shape, + int64_t batch_size, int64_t num_channels, + std::tuple inferred_input_dims, + std::tuple inferred_output_dims, + std::tuple inferred_dim_rscales, + // const TArray& input_strides, + const TArray& output_div_pitches, + gsl::span roi_vals, + const std::optional& extrapolation, + bool exclude_outside, + const TempSpaceAllocateFunc& allocate_temp_space, + const uint8_t* clip8_lookups, + const T* input_data, + T* output_data, + const size_t N) { + using AccumType = typename onnxruntime::AccumulateType::type; + + const bool use_extrapolation = extrapolation.has_value(); + const float extrapolation_value = use_extrapolation ? *extrapolation : 0.f; + + int blocksPerGrid = narrow(CeilDiv(N, GridDim::maxThreadsPerBlock)); + const fast_divmod div_output_image = (rank > 2) ? output_div_pitches[rank - 4] + : fast_divmod(gsl::narrow_cast(N)); + const fast_divmod& div_output_width = output_div_pitches[rank - 2]; + + constexpr float support_value = antialias_constants::kBiCubicSupportSize; + + int64_t input_depth, input_height, input_width; + std::tie(input_depth, input_height, input_width) = inferred_input_dims; + + int64_t output_depth, output_height, output_width; + std::tie(output_depth, output_height, output_width) = inferred_output_dims; + + int blocksPerDimsMappingGrid = + narrow(CeilDiv((output_depth + output_height + output_width), 32)); + + float h_scale, w_scale; + std::tie(std::ignore, h_scale, w_scale) = inferred_dim_rscales; + + SafeInt bounds_buffer_size = (SafeInt(output_height) + output_width) * 2; + SafeInt out_of_bounds_buffer_size = (SafeInt(output_height) + output_width); + + float h_scaled_support, w_scaled_support; + int32_t h_window_size, w_window_size; + const auto [weighted_y_size, weighted_w_size] = + ComputeBilinearScaleBufferSize(output_height, output_width, + h_scale, w_scale, support_value, + h_scaled_support, w_scaled_support, h_window_size, w_window_size); + + auto bounds_buffer_ptr = AllocateTyped(allocate_temp_space, bounds_buffer_size); + auto out_of_bounds_buffer_ptr = AllocateTyped(allocate_temp_space, out_of_bounds_buffer_size); + + int64_t* y_bounds_buffer = GetTyped(bounds_buffer_ptr); + int64_t* w_bounds_buffer = y_bounds_buffer + output_height * 2; + + int64_t* y_outof_bounds_buffer = GetTyped(out_of_bounds_buffer_ptr); + int64_t* w_outof_bounds_buffer = y_outof_bounds_buffer + output_height; + + const int64_t weighted_buffer_size = SafeInt(weighted_y_size) + + weighted_w_size; + auto weighted_buffer_ptr = AllocateTyped(allocate_temp_space, weighted_buffer_size); + + AccumType* y_weighted_buffer = GetTyped(weighted_buffer_ptr); + AccumType* w_weighted_buffer = y_weighted_buffer + weighted_y_size; + + const auto temp_buf_size = SafeInt(batch_size) * num_channels * input_height * output_width; + auto image_temp_buffer = AllocateTyped(allocate_temp_space, narrow(temp_buf_size)); + + // clang-format off + DISPATCH_ANTIALIAS_FILTER_SETUP(coordinate_transform_mode, [&]() { + _SetupBilinearUpsampleFilterAntiAlias<<>>( + std::make_tuple(input_height, input_width), + std::make_tuple(output_height, output_width), + std::make_tuple(h_scale, w_scale), + std::make_tuple(roi_vals[rank - 2], roi_vals[rank - 1]), // roi starts h, w + std::make_tuple(roi_vals[rank - 2 + rank], roi_vals[rank - 1 + rank]), // roi ends h, w + 
std::make_tuple(h_scaled_support, w_scaled_support), + std::make_tuple(h_window_size, w_window_size), + onnxruntime::antialias_constants::kCubicCoeffA, exclude_outside, + GetTyped(bounds_buffer_ptr), + GetTyped(out_of_bounds_buffer_ptr), + std::make_tuple(y_weighted_buffer, w_weighted_buffer)); + }); + // clang-format on + const fast_divmod div_step_image(narrow(num_channels * input_height * output_width)); + // clang-format off + _ComputeInterpolationAtLevel1<<>>( + num_channels, input_height, input_width, input_height, output_width, + div_output_width, + div_step_image, + w_window_size, + clip8_lookups, + w_bounds_buffer, + std::make_tuple(y_outof_bounds_buffer, w_outof_bounds_buffer), + w_weighted_buffer, input_data, GetTyped(image_temp_buffer), + narrow(temp_buf_size)); + // clang-format on + + const fast_divmod div_output_height{narrow(output_height * output_width)}; + // clang-format off + _ComputeInterpolationAtLevel2<<>>( + num_channels, input_height, output_width, output_height, output_width, + div_output_height, + div_output_width, + div_output_image, + h_window_size, + use_extrapolation, extrapolation_value, + clip8_lookups, + y_bounds_buffer, + std::make_tuple(y_outof_bounds_buffer, w_outof_bounds_buffer), + y_weighted_buffer, GetTyped(image_temp_buffer), output_data, + narrow(N)); + // clang-format on +} + +template +void ResizeAntiAliasImpl( + cudaStream_t stream, + int rank, + const UpsampleMode upsample_mode, + ResizeCoordinateTransformationMode coordinate_transform_mode, + gsl::span input_shape, + gsl::span output_shape, + int64_t batch_size, int64_t num_channels, + std::tuple inferred_input_dims, + std::tuple inferred_output_dims, + std::tuple inferred_dim_rscales, + const TArray& output_div_pitches, + gsl::span roi_vals, + const std::optional& extrapolation, + bool exclude_outside, + TempSpaceAllocateFunc allocate_temp_space, + const uint8_t* clip8_lookups, + const T* input_data, + T* output_data, + const size_t N) { + // We support a special case of bilinear or bicubic if the input data is 4D with the outer 2 scales being 1.0 + // We would have validated the outer scale values by the time execution reaches this + const bool is_2D = (rank == 2 || rank == 4); + + // We support a special case of trilinear or tricubic if the input data is 5D with the outer 2 scales being 1.0 + // We would have validated the outer scale values by the time execution reaches this + const bool is_3D = (rank == 3 || rank == 5); + + // Should not hit this as we have already validated input rank/scales and we provide verbose error messages + // to the user. 
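The dispatch that follows keys off rank alone once the outer scales have been validated upstream: rank 2 or 4 takes the 2-D (H, W) bilinear/bicubic path, rank 3 or 5 takes the 3-D (D, H, W) trilinear path. A trivial standalone restatement of that classification, with illustrative names only (not the ORT API):

```cpp
// Minimal sketch of the rank-based antialias dispatch described above.
#include <cstdio>

enum class AntiAliasKind { Bilinear2D, Trilinear3D, Unsupported };

AntiAliasKind ClassifyLinearResize(int rank) {
  if (rank == 2 || rank == 4) return AntiAliasKind::Bilinear2D;   // [H,W] or [N,C,H,W]
  if (rank == 3 || rank == 5) return AntiAliasKind::Trilinear3D;  // [D,H,W] or [N,C,D,H,W]
  return AntiAliasKind::Unsupported;
}

const char* Name(AntiAliasKind k) {
  switch (k) {
    case AntiAliasKind::Bilinear2D: return "2-D (bilinear/bicubic) path";
    case AntiAliasKind::Trilinear3D: return "3-D (trilinear) path";
    default: return "not supported with antialias";
  }
}

int main() {
  for (int rank : {2, 3, 4, 5, 6})
    std::printf("rank %d -> %s\n", rank, Name(ClassifyLinearResize(rank)));
  return 0;
}
```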
+ ORT_ENFORCE(is_2D || is_3D, "Only bilinear/trilinear and bicubic modes are supported in Resize anti-alias mode"); + + switch (upsample_mode) { + case UpsampleMode::LINEAR: { + if (is_2D) { + ResizeBiLinearUpsample(stream, rank, upsample_mode, coordinate_transform_mode, + input_shape, output_shape, batch_size, num_channels, + inferred_input_dims, inferred_output_dims, inferred_dim_rscales, + output_div_pitches, roi_vals, extrapolation, exclude_outside, + allocate_temp_space, clip8_lookups, input_data, output_data, N); + } else if (is_3D) { + ResizeTrilinearUpsample(stream, rank, upsample_mode, coordinate_transform_mode, + input_shape, output_shape, batch_size, num_channels, + inferred_input_dims, inferred_output_dims, inferred_dim_rscales, + output_div_pitches, roi_vals, extrapolation, exclude_outside, + allocate_temp_space, clip8_lookups, input_data, output_data, N); + } else { + ORT_NOT_IMPLEMENTED("Resize supports only 2-D or 3-D in LINEAR mode."); + } + } break; + case CUBIC: { + if (is_2D) { + ResizeBicubicUpsample(stream, rank, upsample_mode, coordinate_transform_mode, + input_shape, output_shape, batch_size, num_channels, + inferred_input_dims, inferred_output_dims, inferred_dim_rscales, + output_div_pitches, roi_vals, extrapolation, exclude_outside, + allocate_temp_space, clip8_lookups, input_data, output_data, N); + } else { + ORT_NOT_IMPLEMENTED("Resize supports only 2-D in CUBIC mode."); + } + } break; + default: + ORT_NOT_IMPLEMENTED("Only bilinear/trilinear and bicubic modes are supported in Resize anti-alias mode"); + break; + } +} + +#define SPECIALIZED_ANTIALIAS_IMPL(T) \ + template void ResizeAntiAliasImpl( \ + cudaStream_t stream, \ + int rank, \ + const UpsampleMode upsample_mode, \ + ResizeCoordinateTransformationMode coordinate_transform_mode, \ + gsl::span input_shape, \ + gsl::span output_shape, \ + int64_t batch_size, int64_t num_channels, \ + std::tuple inferred_input_dims, \ + std::tuple inferred_output_dims, \ + std::tuple inferred_dim_rscales, \ + const TArray& output_div_pitches, \ + gsl::span roi_vals, \ + const std::optional& extrapolation_value, \ + bool exclude_outside, \ + TempSpaceAllocateFunc allocate_temp_space, \ + const uint8_t* clip8_lookups, \ + const T* input_data, \ + T* output_data, \ + const size_t N); + +SPECIALIZED_ANTIALIAS_IMPL(float) +SPECIALIZED_ANTIALIAS_IMPL(double) +SPECIALIZED_ANTIALIAS_IMPL(half) +SPECIALIZED_ANTIALIAS_IMPL(int32_t) +SPECIALIZED_ANTIALIAS_IMPL(uint8_t) + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu index 1a94c7705e913..0cde0ed8e8681 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu @@ -12,7 +12,7 @@ using onnxruntime::ResizeNearestMode; using onnxruntime::UpsampleMode; struct NearestPixel_SIMPLE { - __device__ __forceinline__ int operator() (float x_original, bool is_down_sampling) const { + __device__ __forceinline__ int operator()(float x_original, bool is_down_sampling) const { if (is_down_sampling) { return static_cast(_Ceil(x_original)); } @@ -21,7 +21,7 @@ struct NearestPixel_SIMPLE { }; struct NearestPixel_ROUND_PREFER_FLOOR { - __device__ __forceinline__ int operator() (float x_original, bool) const { + __device__ __forceinline__ int operator()(float x_original, bool) const { if (x_original == static_cast(x_original) + 0.5f) { return static_cast(_Floor(x_original)); } @@ -30,62 +30,23 @@ struct 
NearestPixel_ROUND_PREFER_FLOOR { }; struct NearestPixel_ROUND_PREFER_CEIL { - __device__ __forceinline__ int operator() (float x_original, bool) const { + __device__ __forceinline__ int operator()(float x_original, bool) const { return static_cast(roundf(x_original)); } }; struct NearestPixel_FLOOR { - __device__ __forceinline__ int operator() (float x_original, bool) const { + __device__ __forceinline__ int operator()(float x_original, bool) const { return static_cast(_Floor(x_original)); } }; struct NearestPixel_CEIL { - __device__ __forceinline__ int operator() (float x_original, bool) const { + __device__ __forceinline__ int operator()(float x_original, bool) const { return static_cast(_Ceil(x_original)); } }; -struct TransformCoordinate_ASYMMETRIC { - __device__ __forceinline__ float operator() (float x_resized, float x_scale, float, float, float, float) const { - return x_resized / x_scale; - } -}; - -struct TransformCoordinate_HALF_PIXEL { - __device__ __forceinline__ float operator() (float x_resized, float x_scale, float, float, float, float) const { - return ((x_resized + 0.5f) / x_scale) - 0.5f; - } -}; - -struct TransformCoordinate_PYTORCH_HALF_PIXEL { - __device__ __forceinline__ float operator() (float x_resized, float x_scale, float length_resized, float, float, float) const { - return length_resized > 1 ? (x_resized + 0.5f) / x_scale - 0.5f : 0.0f; - } -}; - -struct TransformCoordinate_TF_HALF_PIXEL_FOR_NN { - __device__ __forceinline__ float operator() (float x_resized, float x_scale, float, float, float, float) const { - return (x_resized + 0.5f) / x_scale; - } -}; - -struct TransformCoordinate_ALIGN_CORNERS { - __device__ __forceinline__ float operator() (float x_resized, float, float length_resized, float length_original, float, float) const { - return length_resized == 1 ? 0 : x_resized * (length_original - 1) / (length_resized - 1); - } -}; - -struct TransformCoordinate_TF_CROP_AND_RESIZE { - __device__ __forceinline__ float operator() (float x_resized, float, float length_resized, float length_original, float roi_start, float roi_end) const { - auto orig = length_resized > 1 - ? roi_start * (length_original - 1) + (x_resized * (roi_end - roi_start) * (length_original - 1)) / (length_resized - 1) - : 0.5 * (roi_start + roi_end) * (length_original - 1); - return static_cast(orig); - } -}; - #define CASE_TYPE_USING_HINT(enum_type, type, HINT, ...) \ case enum_type: { \ using HINT = type; \ @@ -95,20 +56,24 @@ struct TransformCoordinate_TF_CROP_AND_RESIZE { #define CASE_TYPE_COORD(enum_type, type, ...) \ CASE_TYPE_USING_HINT(enum_type, type, coord_t, __VA_ARGS__) -#define DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(TYPE, ...) 
\ - [&] { \ - const auto& the_type = TYPE; \ - /* don't use TYPE again in case it is an expensive or side-effect op */ \ - switch (the_type) { \ - CASE_TYPE_COORD(ResizeCoordinateTransformationMode::HALF_PIXEL, TransformCoordinate_HALF_PIXEL, __VA_ARGS__) \ - CASE_TYPE_COORD(ResizeCoordinateTransformationMode::ASYMMETRIC, TransformCoordinate_ASYMMETRIC, __VA_ARGS__) \ - CASE_TYPE_COORD(ResizeCoordinateTransformationMode::PYTORCH_HALF_PIXEL, TransformCoordinate_PYTORCH_HALF_PIXEL, __VA_ARGS__) \ - CASE_TYPE_COORD(ResizeCoordinateTransformationMode::ALIGN_CORNERS, TransformCoordinate_ALIGN_CORNERS, __VA_ARGS__) \ - CASE_TYPE_COORD(ResizeCoordinateTransformationMode::TF_HALF_PIXEL_FOR_NN, TransformCoordinate_TF_HALF_PIXEL_FOR_NN, __VA_ARGS__) \ - CASE_TYPE_COORD(ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE, TransformCoordinate_TF_CROP_AND_RESIZE, __VA_ARGS__) \ - default: \ - ORT_THROW("unknown ResizeCoordinateTransformationMode"); \ - } \ +#define DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(TYPE, ...) \ + [&] { \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op */ \ + switch (the_type) { \ + CASE_TYPE_COORD(ResizeCoordinateTransformationMode::HALF_PIXEL, TransformCoordinate_HALF_PIXEL, __VA_ARGS__) \ + CASE_TYPE_COORD(ResizeCoordinateTransformationMode::ASYMMETRIC, TransformCoordinate_ASYMMETRIC, __VA_ARGS__) \ + CASE_TYPE_COORD(ResizeCoordinateTransformationMode::PYTORCH_HALF_PIXEL, \ + TransformCoordinate_PYTORCH_HALF_PIXEL, __VA_ARGS__) \ + CASE_TYPE_COORD(ResizeCoordinateTransformationMode::ALIGN_CORNERS, \ + TransformCoordinate_ALIGN_CORNERS, __VA_ARGS__) \ + CASE_TYPE_COORD(ResizeCoordinateTransformationMode::TF_HALF_PIXEL_FOR_NN, \ + TransformCoordinate_TF_HALF_PIXEL_FOR_NN, __VA_ARGS__) \ + CASE_TYPE_COORD(ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE, \ + TransformCoordinate_TF_CROP_AND_RESIZE, __VA_ARGS__) \ + default: \ + ORT_THROW("unknown ResizeCoordinateTransformationMode"); \ + } \ }() #define CASE_TYPE_NEAREST(enum_type, type, ...) 
\ @@ -119,11 +84,11 @@ struct TransformCoordinate_TF_CROP_AND_RESIZE { const auto& the_type = TYPE; \ /* don't use TYPE again in case it is an expensive or side-effect op */ \ switch (the_type) { \ - CASE_TYPE_NEAREST(ResizeNearestMode::SIMPLE, NearestPixel_SIMPLE, __VA_ARGS__) \ + CASE_TYPE_NEAREST(ResizeNearestMode::SIMPLE, NearestPixel_SIMPLE, __VA_ARGS__) \ CASE_TYPE_NEAREST(ResizeNearestMode::ROUND_PREFER_FLOOR, NearestPixel_ROUND_PREFER_FLOOR, __VA_ARGS__) \ - CASE_TYPE_NEAREST(ResizeNearestMode::ROUND_PREFER_CEIL, NearestPixel_ROUND_PREFER_CEIL, __VA_ARGS__) \ - CASE_TYPE_NEAREST(ResizeNearestMode::FLOOR, NearestPixel_FLOOR, __VA_ARGS__) \ - CASE_TYPE_NEAREST(ResizeNearestMode::CEIL, NearestPixel_CEIL, __VA_ARGS__) \ + CASE_TYPE_NEAREST(ResizeNearestMode::ROUND_PREFER_CEIL, NearestPixel_ROUND_PREFER_CEIL, __VA_ARGS__) \ + CASE_TYPE_NEAREST(ResizeNearestMode::FLOOR, NearestPixel_FLOOR, __VA_ARGS__) \ + CASE_TYPE_NEAREST(ResizeNearestMode::CEIL, NearestPixel_CEIL, __VA_ARGS__) \ default: \ ORT_THROW("unknown ResizeNearestMode"); \ } \ @@ -151,10 +116,12 @@ __global__ void _ResizeNearestMappingKernel2D( // only apply co-ordinate transformation if scale != 1.0 if (scales_height == 1.0f) { - dims_mapping[id].extrapolate_ = 0; + dims_mapping[id].extrapolate_ = 0; } else { - float orig_coord = transform_coordinate(static_cast(dim), scales_height, static_cast(output_height), - static_cast(input_height), roi_start_height, roi_end_height); + float orig_coord = transform_coordinate(static_cast(dim), scales_height, + static_cast(output_height), + static_cast(input_height), + roi_start_height, roi_end_height); dims_mapping[id].extrapolate_ = static_cast( extrapolation_enabled && (orig_coord < 0.f || orig_coord > static_cast(input_height - 1))); dim = calc_nearest_pixel(orig_coord, scales_height < 1); @@ -210,9 +177,12 @@ __global__ void _ResizeNearestMappingKernel( if (scales[axis] == 1.0f) { dims_mapping[id].extrapolate_ = 0; } else { - float orig_coord = transform_coordinate(static_cast(dim), scales[axis], static_cast(output_shape[axis]), + float orig_coord = transform_coordinate(static_cast(dim), scales[axis], + static_cast(output_shape[axis]), static_cast(input_shape[axis]), roi[axis], roi[axis + rank]); - dims_mapping[id].extrapolate_ = static_cast(extrapolation_enabled && (orig_coord < 0.f || orig_coord > static_cast(input_shape[axis] - 1))); + dims_mapping[id].extrapolate_ = static_cast(extrapolation_enabled && + (orig_coord < 0.f || + orig_coord > static_cast(input_shape[axis] - 1))); dim = calc_nearest_pixel(orig_coord, scales[axis] < 1); if (dim >= input_shape[axis]) dim = input_shape[axis] - 1; if (dim < 0) dim = 0; @@ -293,21 +263,27 @@ __global__ void _ResizeBilinearCoordinateMapping( LinearMappingInfo* dims_mapping) { CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumHW); if (id < output_height) { // y = id - float input_y = scale_height == 1 ? static_cast(id) : - transform_coordinate(static_cast(id), scale_height, - static_cast(output_height), static_cast(input_height), - roi_height_start, roi_height_end); - dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_y < 0 || input_y > static_cast(input_height - 1))); + float input_y = scale_height == 1 ? 
static_cast(id) + : transform_coordinate(static_cast(id), scale_height, + static_cast(output_height), + static_cast(input_height), + roi_height_start, roi_height_end); + dims_mapping[id].extrapolate_ = static_cast((extrapolation_enabled && + (input_y < 0 || + input_y > static_cast(input_height - 1)))); input_y = max(0.0f, min(input_y, static_cast(input_height - 1))); int y_int = static_cast(input_y); dims_mapping[id].origin_ = y_int; dims_mapping[id].weight_ = (y_int >= input_height - 1) ? 0.5f : input_y - y_int; - } else { //x = id - output_height - float input_x = scale_width == 1 ? static_cast(id - output_height) : - transform_coordinate(static_cast(id - output_height), scale_width, - static_cast(output_width), static_cast(input_width), - roi_width_start, roi_width_end); - dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_x < 0 || input_x > static_cast(input_width - 1))); + } else { // x = id - output_height + float input_x = scale_width == 1 ? static_cast(id - output_height) + : transform_coordinate(static_cast(id - output_height), + scale_width, static_cast(output_width), + static_cast(input_width), roi_width_start, + roi_width_end); + dims_mapping[id].extrapolate_ = static_cast((extrapolation_enabled && + (input_x < 0 || + input_x > static_cast(input_width - 1)))); input_x = max(0.0f, min(input_x, static_cast(input_width - 1))); int x_int = static_cast(input_x); dims_mapping[id].origin_ = x_int; @@ -371,32 +347,40 @@ __global__ void _ResizeTrilinearCoordinateMapping( LinearMappingInfo* dims_mapping) { CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumDHW); if (id < output_depth) { // z = id - float input_z = scale_depth == 1 ? static_cast(id) : - transform_coordinate(static_cast(id), scale_depth, - static_cast(output_depth), static_cast(input_depth), - roi_depth_start, roi_depth_end); - dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_z < 0 || input_z > static_cast(input_depth - 1))); + float input_z = scale_depth == 1 ? static_cast(id) + : transform_coordinate(static_cast(id), scale_depth, + static_cast(output_depth), + static_cast(input_depth), + roi_depth_start, roi_depth_end); + dims_mapping[id].extrapolate_ = static_cast((extrapolation_enabled && + (input_z < 0 || + input_z > static_cast(input_depth - 1)))); input_z = max(0.0f, min(input_z, static_cast(input_depth - 1))); int z_int = static_cast(input_z); dims_mapping[id].origin_ = z_int; dims_mapping[id].weight_ = (z_int >= input_depth - 1) ? 0.5f : input_z - z_int; } else if (id >= output_depth && id < (output_depth + output_height)) { // y = id - output_depth - float input_y = scale_height == 1 ? static_cast(id - output_depth) : - transform_coordinate(static_cast(id - output_depth), scale_height, - static_cast(output_height), static_cast(input_height), - roi_height_start, roi_height_end); - - dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_y < 0 || input_y > static_cast(input_height - 1))); + float input_y = scale_height == 1 ? static_cast(id - output_depth) + : transform_coordinate(static_cast(id - output_depth), + scale_height, static_cast(output_height), + static_cast(input_height), + roi_height_start, roi_height_end); + + dims_mapping[id].extrapolate_ = static_cast((extrapolation_enabled && + (input_y < 0 || + input_y > static_cast(input_height - 1)))); input_y = max(0.0f, min(input_y, static_cast(input_height - 1))); int y_int = static_cast(input_y); dims_mapping[id].origin_ = y_int; dims_mapping[id].weight_ = (y_int >= input_height - 1) ? 
0.5f : input_y - y_int; - } else { //x = id - output_depth - output_height - float input_x = scale_width == 1 ? static_cast(id - output_depth - output_height) : - transform_coordinate(static_cast(id - output_depth - output_height), scale_width, - static_cast(output_width), static_cast(input_width), - roi_width_start, roi_width_end); - dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_x < 0 || input_x > static_cast(input_width - 1))); + } else { // x = id - output_depth - output_height + float input_x = scale_width == 1 ? static_cast(id - output_depth - output_height) + : transform_coordinate(static_cast(id - output_depth - output_height), + scale_width, static_cast(output_width), + static_cast(input_width), + roi_width_start, roi_width_end); + dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_x < 0 || + input_x > static_cast(input_width - 1))); input_x = max(0.0f, min(input_x, static_cast(input_width - 1))); int x_int = static_cast(input_x); dims_mapping[id].origin_ = x_int; @@ -513,21 +497,33 @@ __global__ void _ResizeCubicCoordinateMapping( int max_input_coord = static_cast(is_y_axis ? input_height : input_width); float scale = is_y_axis ? scale_height : scale_width; - float input_coordinat = scale == 1 ? (is_y_axis ? id : id - output_height) : - transform_coordinate( - static_cast(is_y_axis ? id : id - output_height), - scale, - static_cast(is_y_axis ? output_height : output_width), - static_cast(max_input_coord), - (is_y_axis ? roi_height_start : roi_width_start), - (is_y_axis ? roi_height_end : roi_width_end)); + float input_coordinat = scale == 1 ? (is_y_axis ? id : id - output_height) + : transform_coordinate( + static_cast(is_y_axis ? id : id - output_height), + scale, + static_cast(is_y_axis ? output_height : output_width), + static_cast(max_input_coord), + (is_y_axis ? roi_height_start : roi_width_start), + (is_y_axis ? roi_height_end : roi_width_end)); int coord_int = static_cast(_Floor(input_coordinat)); float s_coord = abs(input_coordinat - coord_int); float coeff_sum = 1.0f; - float coeff_0 = static_cast(((cubic_coeff_a * (s_coord + 1) - 5 * cubic_coeff_a) * (s_coord + 1) + 8 * cubic_coeff_a) * (s_coord + 1) - 4 * cubic_coeff_a); - float coeff_1 = static_cast(((cubic_coeff_a + 2) * s_coord - (cubic_coeff_a + 3)) * s_coord * s_coord + 1); - float coeff_2 = static_cast(((cubic_coeff_a + 2) * (1 - s_coord) - (cubic_coeff_a + 3)) * (1 - s_coord) * (1 - s_coord) + 1); - float coeff_3 = static_cast(((cubic_coeff_a * (2 - s_coord) - 5 * cubic_coeff_a) * (2 - s_coord) + 8 * cubic_coeff_a) * (2 - s_coord) - 4 * cubic_coeff_a); + float coeff_0 = static_cast(((cubic_coeff_a * (s_coord + 1) - 5 * cubic_coeff_a) * + (s_coord + 1) + + 8 * cubic_coeff_a) * + (s_coord + 1) - + 4 * cubic_coeff_a); + float coeff_1 = static_cast(((cubic_coeff_a + 2) * s_coord - (cubic_coeff_a + 3)) * + s_coord * s_coord + + 1); + float coeff_2 = static_cast(((cubic_coeff_a + 2) * (1 - s_coord) - (cubic_coeff_a + 3)) * + (1 - s_coord) * (1 - s_coord) + + 1); + float coeff_3 = static_cast(((cubic_coeff_a * (2 - s_coord) - 5 * cubic_coeff_a) * + (2 - s_coord) + + 8 * cubic_coeff_a) * + (2 - s_coord) - + 4 * cubic_coeff_a); if (exclude_outside) { coeff_0 = (coord_int - 1 < 0 || coord_int - 1 >= max_input_coord) ? 0.0 : coeff_0; coeff_1 = (coord_int + 0 < 0 || coord_int + 0 >= max_input_coord) ? 
0.0 : coeff_1; @@ -540,7 +536,8 @@ __global__ void _ResizeCubicCoordinateMapping( dm.coeff1_ = coeff_1 / coeff_sum; dm.coeff2_ = coeff_2 / coeff_sum; dm.coeff3_ = coeff_3 / coeff_sum; - dm.extrapolate_ = (int)(extrapolation_enabled && (input_coordinat < 0 || input_coordinat > static_cast(max_input_coord - 1))); + dm.extrapolate_ = (int)(extrapolation_enabled && (input_coordinat < 0 || + input_coordinat > static_cast(max_input_coord - 1))); } template @@ -569,21 +566,30 @@ __global__ void _ResizeBiCubicKernel( int x_int = x_info.origin_; int y_int = y_info.origin_; const T* image = input_data + input_index; - output_data[id] = y_info.coeff0_ * CubicInterpolationRowwise(image, x_int, y_int - 1, input_height, input_width, w0, w1, w2, w3) + - y_info.coeff1_ * CubicInterpolationRowwise(image, x_int, y_int, input_height, input_width, w0, w1, w2, w3) + - y_info.coeff2_ * CubicInterpolationRowwise(image, x_int, y_int + 1, input_height, input_width, w0, w1, w2, w3) + - y_info.coeff3_ * CubicInterpolationRowwise(image, x_int, y_int + 2, input_height, input_width, w0, w1, w2, w3); + output_data[id] = y_info.coeff0_ * + CubicInterpolationRowwise(image, x_int, y_int - 1, input_height, input_width, w0, w1, w2, w3) + + y_info.coeff1_ * + CubicInterpolationRowwise(image, x_int, y_int, input_height, input_width, w0, w1, w2, w3) + + y_info.coeff2_ * + CubicInterpolationRowwise(image, x_int, y_int + 1, input_height, input_width, w0, w1, w2, w3) + + y_info.coeff3_ * + CubicInterpolationRowwise(image, x_int, y_int + 2, input_height, input_width, w0, w1, w2, w3); } size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode, const gsl::span& output_dims) { switch (upsample_mode) { case UpsampleMode::NN: - return sizeof(int64_t) * output_dims.size() + sizeof(NearestMappingInfo) * static_cast(std::accumulate(output_dims.begin(), output_dims.end(), (int64_t)0)); + return sizeof(int64_t) * output_dims.size() + + sizeof(NearestMappingInfo) * + static_cast(std::accumulate(output_dims.begin(), + output_dims.end(), (int64_t)0)); case UpsampleMode::LINEAR: - return sizeof(LinearMappingInfo) * static_cast(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, (int64_t)0)); + return sizeof(LinearMappingInfo) * + static_cast(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, (int64_t)0)); case UpsampleMode::CUBIC: - return sizeof(CubicMappingInfo) * static_cast(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, (int64_t)0)); + return sizeof(CubicMappingInfo) * + static_cast(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, (int64_t)0)); } return 0; } @@ -616,7 +622,8 @@ void ResizeNearestImpl( if (could2d) { int64_t output_height = output_shape[rank - 2]; int64_t output_width = output_shape[rank - 1]; - fast_divmod div_output_image = (rank > 2) ? output_div_pitches[rank - 3] : fast_divmod(static_cast(output_height * output_width)); + fast_divmod div_output_image = (rank > 2) ? 
output_div_pitches[rank - 3] + : fast_divmod(static_cast(output_height * output_width)); int blocksPerDimsMappingGrid = static_cast(ceil((output_height + output_width) / 32.0)); DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(transform_coordinate, [&]() { @@ -694,13 +701,6 @@ void ResizeImpl( ResizeCoordinateTransformationMode coordinate_transform_mode, ResizeNearestMode nearest_mode, void* dims_mapping) { - bool isSame = std::all_of(scales_vals.Data(), scales_vals.Data() + rank, [](float v) { return v == 1.0f; }) && - (coordinate_transform_mode != ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE); - if (isSame) { - CUDA_CALL_THROW(cudaMemcpyAsync(output_data, input_data, N * sizeof(T), cudaMemcpyDeviceToDevice, stream)); - return; - } - if (upsample_mode == UpsampleMode::NN) { ResizeNearestImpl( stream, rank, input_shape, output_shape, input_strides, output_div_pitches, @@ -761,7 +761,7 @@ void ResizeImpl( } else if (is_3D) { DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(coordinate_transform_mode, [&]() { _ResizeTrilinearCoordinateMapping<<>>( - input_shape[rank - 3] , input_shape[rank - 2], input_shape[rank - 1], + input_shape[rank - 3], input_shape[rank - 2], input_shape[rank - 1], output_depth, output_height, output_width, scales_vals[rank - 3], scales_vals[rank - 2], scales_vals[rank - 1], roi_vals[rank - 3], roi_vals[rank - 3 + rank], @@ -778,7 +778,7 @@ void ResizeImpl( reinterpret_cast(dims_mapping)); return; } - ORT_THROW("Only bilinear/trilinear and bicubic modes are supported in Resize"); + ORT_THROW("Resize support 2-D and 3-D dimensions in LINEAR mode."); break; case UpsampleMode::CUBIC: if (is_2D) { @@ -801,7 +801,7 @@ void ResizeImpl( reinterpret_cast(dims_mapping)); return; } - ORT_THROW("Only bilinear/trilinear and bicubic modes are supported in Resize"); + ORT_THROW("Resize supports only 2-D in CUBIC mode."); case UpsampleMode::NN: ORT_THROW("Only bilinear/trilinear and bicubic modes are supported in Resize"); } @@ -809,7 +809,7 @@ void ResizeImpl( #define SPECIALIZED_IMPL(T) \ template void ResizeImpl( \ - cudaStream_t stream, \ + cudaStream_t stream, \ const UpsampleMode upsample_mode, \ const int rank, \ TArray& input_shape, \ diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.h b/onnxruntime/core/providers/cuda/tensor/resize_impl.h index d459dbff18d3e..ad06eebb9efb1 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.h @@ -2,15 +2,69 @@ // Licensed under the MIT License. #pragma once + #include + +#include + #include "core/providers/cuda/shared_inc/cuda_utils.h" #include "core/common/common.h" #include "core/providers/cpu/tensor/upsamplebase.h" #include "core/providers/cuda/cuda_common.h" namespace onnxruntime { +template <> +struct AccumulateType { + using type = float; +}; namespace cuda { +struct TransformCoordinate_ASYMMETRIC { + __device__ __host__ __forceinline__ float operator()(float x_resized, float x_scale, + float, float, float, float) const { + return x_resized / x_scale; + } +}; + +struct TransformCoordinate_HALF_PIXEL { + __device__ __host__ __forceinline__ float operator()(float x_resized, float x_scale, + float, float, float, float) const { + return ((x_resized + 0.5f) / x_scale) - 0.5f; + } +}; + +struct TransformCoordinate_PYTORCH_HALF_PIXEL { + __device__ __host__ __forceinline__ float operator()(float x_resized, float x_scale, float length_resized, float, + float, float) const { + return length_resized > 1 ? 
(x_resized + 0.5f) / x_scale - 0.5f : 0.0f; + } +}; + +struct TransformCoordinate_TF_HALF_PIXEL_FOR_NN { + __device__ __host__ __forceinline__ float operator()(float x_resized, float x_scale, + float, float, float, float) const { + return (x_resized + 0.5f) / x_scale; + } +}; + +struct TransformCoordinate_ALIGN_CORNERS { + __device__ __host__ __forceinline__ float operator()(float x_resized, float, float length_resized, + float length_original, float, float) const { + return length_resized == 1 ? 0 : x_resized * (length_original - 1) / (length_resized - 1); + } +}; + +struct TransformCoordinate_TF_CROP_AND_RESIZE { + __device__ __host__ __forceinline__ float operator()(float x_resized, float, float length_resized, + float length_original, float roi_start, float roi_end) const { + auto orig = length_resized > 1 + ? roi_start * (length_original - 1) + + (x_resized * (roi_end - roi_start) * (length_original - 1)) / (length_resized - 1) + : 0.5 * (roi_start + roi_end) * (length_original - 1); + return static_cast(orig); + } +}; + size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode, const gsl::span& output_dims); @@ -36,5 +90,62 @@ void ResizeImpl( onnxruntime::ResizeNearestMode nearest_mode, void* dims_mapping); +using TempSpaceAllocateFunc = std::function(size_t buffer_size)>; + +template +void ResizeAntiAliasImpl( + cudaStream_t stream, + int rank, + const UpsampleMode upsample_mode, + ResizeCoordinateTransformationMode coordinate_transform_mode, + gsl::span input_shape, + gsl::span output_shape, + int64_t batch_size, int64_t num_channels, + std::tuple inferred_input_dims, + std::tuple inferred_output_dims, + std::tuple inferred_dim_rscales, + const TArray& output_div_pitches, + gsl::span roi_vals, // CPU + const std::optional& extrapolation_value, + bool exclude_outside, + TempSpaceAllocateFunc allocate_temp_space, + const uint8_t* clip8_lookups, + const T* input_data, + T* output_data, + const size_t N); + +/// +/// Compute scaled support value for a given dimension inverse scale +/// +/// Support value from parameters +/// inverse scale value comes from input/attr for +/// +inline float ComputeScaledSupportValue(float support_value, float rscale) { + const float scale = 1.0f / rscale; + float scaled_support = (scale >= 1.0f) ? (support_value * 0.5f) * scale : support_value * 0.5f; + return scaled_support; +} + +/// +/// Compute window size for a given dimension scaled support value. +/// +/// +/// +inline int32_t ComputeWindowSize(float scaled_support) { + SafeInt window_size(ceilf(scaled_support)); + return window_size * 2 + 1; +} + +/// +/// Computes scale buffer size in number of elements for allocation purposes. +/// +/// +/// +/// Number of elements to fit in the buffer +inline SafeInt ComputeWeightedCoeffBufferSize(int64_t output_size, int32_t window_size) { + SafeInt buffer_size(output_size); + return buffer_size * window_size; +} + } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/upsample.cc b/onnxruntime/core/providers/cuda/tensor/upsample.cc index ae12ca328bc7c..17533eb3d9a72 100644 --- a/onnxruntime/core/providers/cuda/tensor/upsample.cc +++ b/onnxruntime/core/providers/cuda/tensor/upsample.cc @@ -2,6 +2,9 @@ // Licensed under the MIT License. 
#include "upsample.h" + +#include + #include "upsample_impl.h" #include "core/providers/cuda/tensor/resize_impl.h" #include "core/providers/cpu/tensor/utils.h" @@ -37,11 +40,23 @@ REGISTER_VERSIONED_TYPED_KERNEL(MLFloat16, 9, 9); REGISTER_VERSIONED_TYPED_KERNEL(int32_t, 9, 9); REGISTER_VERSIONED_TYPED_KERNEL(uint8_t, 9, 9); +template +Upsample::Upsample(const OpKernelInfo& info) : UpsampleBase(info), CudaKernel(info) { + if (UpsampleBase::antialias_) { + // Copy the table on DEVICE + const uint8_t* lookup_table = GetLookupTableShared(); + auto alloc = info.GetAllocator(OrtMemTypeDefault); + shared_lookup_table_ondevice_ = IAllocator::MakeUniquePtr(std::move(alloc), kLookupTableSize); + CUDA_CALL_THROW(cudaMemcpyAsync(shared_lookup_table_ondevice_.get(), lookup_table, kLookupTableSize, + cudaMemcpyHostToDevice, nullptr)); + } +} + template Status Upsample::BaseCompute(OpKernelContext* context, - const std::vector& roi, - const std::vector& scales, - const gsl::span& output_dims) const { + gsl::span roi, + gsl::span scales, + gsl::span output_dims) const { const Tensor* X = context->Input(0); auto X_dims = X->Shape().GetDims(); int32_t rank = static_cast(X_dims.size()); @@ -52,7 +67,8 @@ Status Upsample::BaseCompute(OpKernelContext* context, is_resize_ ? "Resize: input tensor cannot be scalar." : "Upsample: input tensor cannot be scalar."); if (rank != static_cast(scales.size())) return Status(ONNXRUNTIME, INVALID_ARGUMENT, - is_resize_ ? "Resize: input tensor's dimension does not match the scales." : "Upsample: input tensor's dimension does not match the scales."); + is_resize_ ? "Resize: input tensor's dimension does not match the scales." + : "Upsample: input tensor's dimension does not match the scales."); if (roi.size() != 2 * X_dims.size()) return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Resize: size of roi array should be 2 * N where N is the rank of input tensor X."); @@ -79,22 +95,194 @@ Status Upsample::BaseCompute(OpKernelContext* context, size_t output_count = Y->Shape().Size(); if (is_resize_) { - TArray input_shape(X_dims); - TArray output_shape(output_dims); - TArray roi_vals(roi); - TArray scales_vals(scales); - - size_t temp_buffer_size = CalcResizeBufferSize(mode_, output_dims); - auto dims_mapping_buffer = GetScratchBuffer(temp_buffer_size, context->GetComputeStream()); - void* dims_mapping = reinterpret_cast(dims_mapping_buffer.get()); - ResizeImpl(Stream(context), mode_, (int)rank, input_shape, output_shape, - input_strides, output_div_pitches, scales_vals, roi_vals, - reinterpret_cast(X->Data()), - reinterpret_cast(Y->MutableData()), - output_count, use_extrapolation_, ToCudaType::FromFloat(extrapolation_value_), - cubic_coeff_a_, exclude_outside_, - coordinate_transform_mode_, nearest_mode_, - dims_mapping); + const bool is_same = std::all_of(scales.begin(), scales.end(), [](float v) { return v == 1.0f; }) && + (coordinate_transform_mode_ != ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE); + if (is_same) { + CUDA_CALL_THROW(cudaMemcpyAsync(Y->MutableData(), X->Data(), + output_count * sizeof(T), cudaMemcpyDeviceToDevice, Stream(context))); + return Status::OK(); + } + + if (antialias_) { + TempSpaceAllocateFunc allocate_temp_space = [&](size_t bytes_size) { + return GetScratchBuffer(bytes_size, context->GetComputeStream()); + }; + + std::optional extrapolation_value; + if (use_extrapolation_) + extrapolation_value.emplace(extrapolation_value_); + + switch (mode_) { + case UpsampleMode::LINEAR: { + if (X_dims.size() == 2 || X_dims.size() == 4) { + const bool is_2D 
= X_dims.size() == 2; + + int64_t batch_size = 1; + int64_t num_channels = 1; + + int64_t input_height; + int64_t input_width; + + int64_t output_height; + int64_t output_width; + + float height_scale; + float width_scale; + + if (is_2D) { + input_height = X_dims[0]; + input_width = X_dims[1]; + + output_height = output_dims[0]; + output_width = output_dims[1]; + + height_scale = scales[0]; + width_scale = scales[1]; + } else { + if (scales[0] == 1.0f && scales[1] == 1.0f) { + batch_size = X_dims[Channels::N]; + num_channels = X_dims[Channels::C]; + input_height = X_dims[Channels::H]; + input_width = X_dims[Channels::W]; + + output_height = output_dims[Channels::H]; + output_width = output_dims[Channels::W]; + + height_scale = scales[2]; + width_scale = scales[3]; + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "Resize", ": NHWC is not supported yet"); + } + } + + ResizeAntiAliasImpl(Stream(context), + rank, + mode_, + coordinate_transform_mode_, + X_dims, output_dims, + batch_size, num_channels, + std::make_tuple(0, input_height, input_width), + std::make_tuple(0, output_height, output_width), + std::make_tuple(0.f, height_scale, width_scale), + output_div_pitches, + roi, + extrapolation_value, + exclude_outside_, + allocate_temp_space, + shared_lookup_table_ondevice_.get(), + reinterpret_cast(X->Data()), + reinterpret_cast(Y->MutableData()), + output_count); + + } else if (X_dims.size() == 3 || X_dims.size() == 5) { + const bool is_3D = X_dims.size() == 3; + + if (!is_3D) { + if (!(scales[0] == 1.0f && scales[1] == 1.0f)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "Resize", ": NDHWC is not supported yet"); + } + } + + const int64_t batch_size = is_3D ? 1 : X_dims[0]; + const int64_t num_channels = is_3D ? 1 : X_dims[1]; + const int64_t input_depth = is_3D ? X_dims[0] : X_dims[2]; + const int64_t input_height = is_3D ? X_dims[1] : X_dims[3]; + const int64_t input_width = is_3D ? X_dims[2] : X_dims[4]; + + const int64_t output_depth = is_3D ? output_dims[0] : output_dims[2]; + const int64_t output_height = is_3D ? output_dims[1] : output_dims[3]; + const int64_t output_width = is_3D ? output_dims[2] : output_dims[4]; + + const float depth_scale = is_3D ? scales[0] : scales[2]; + const float height_scale = is_3D ? scales[1] : scales[3]; + const float width_scale = is_3D ? scales[2] : scales[4]; + + ResizeAntiAliasImpl(Stream(context), + rank, + mode_, + coordinate_transform_mode_, + X_dims, output_dims, + batch_size, num_channels, + std::make_tuple(input_depth, input_height, input_width), + std::make_tuple(output_depth, output_height, output_width), + std::make_tuple(depth_scale, height_scale, width_scale), + output_div_pitches, + roi, + extrapolation_value, + exclude_outside_, + allocate_temp_space, + shared_lookup_table_ondevice_.get(), + reinterpret_cast(X->Data()), + reinterpret_cast(Y->MutableData()), + output_count); + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "Resize", + ": 'Linear' mode only supports 2-D inputs or 3-D inputs ('Bilinear', 'Trilinear') " + "or 4-D inputs or 5-D inputs with the corresponding outermost 2 scale values " + "being 1."); + } + } break; + case UpsampleMode::CUBIC: { + if (X_dims.size() != 2 && X_dims.size() != 4) { + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "Resize", + ": 'Cubic' mode only supports 2-D inputs ('Bicubic') or 4-D inputs " + "with the corresponding outermost 2 scale values being 1."); + } + + const bool is_2D = X_dims.size() == 2; + const bool is_nchw = is_2D ? 
true : (scales[0] == 1.0f && scales[1] == 1.0f); + + ORT_RETURN_IF_NOT(is_nchw, + "Resize 'Cubic' mode only supports NCHW layout " + "with 2-D inputs, or 4-D inputs with the corresponding outermost 2 scale values being 1"); + + const int64_t batch_size = is_2D ? 1 : X_dims[Channels::N]; + const int64_t num_channels = is_2D ? 1 : X_dims[Channels::C]; + const int64_t input_height = is_2D ? X_dims[0] : X_dims[Channels::H]; + const int64_t input_width = is_2D ? X_dims[1] : X_dims[Channels::W]; + + const int64_t output_height = is_2D ? output_dims[0] : output_dims[Channels::H]; + const int64_t output_width = is_2D ? output_dims[1] : output_dims[Channels::W]; + const float height_scale = is_2D ? scales[0] : scales[2]; + const float width_scale = is_2D ? scales[1] : scales[3]; + + ResizeAntiAliasImpl(Stream(context), rank, mode_, coordinate_transform_mode_, + X_dims, output_dims, + batch_size, num_channels, + std::make_tuple(0, input_height, input_width), + std::make_tuple(0, output_height, output_width), + std::make_tuple(0.f, height_scale, width_scale), + output_div_pitches, + roi, + extrapolation_value, + exclude_outside_, + allocate_temp_space, + shared_lookup_table_ondevice_.get(), + reinterpret_cast(X->Data()), + reinterpret_cast(Y->MutableData()), + output_count); + } break; + default: + return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Resize: unexpected mode"); + } + } else { + TArray input_shape(X_dims); + TArray output_shape(output_dims); + TArray roi_vals(roi); + TArray scales_vals(scales); + + size_t temp_buffer_size = CalcResizeBufferSize(mode_, output_dims); + auto dims_mapping_buffer = GetScratchBuffer(temp_buffer_size, context->GetComputeStream()); + void* dims_mapping = reinterpret_cast(dims_mapping_buffer.get()); + ResizeImpl(Stream(context), mode_, rank, input_shape, output_shape, + input_strides, output_div_pitches, scales_vals, roi_vals, + reinterpret_cast(X->Data()), + reinterpret_cast(Y->MutableData()), + output_count, use_extrapolation_, ToCudaType::FromFloat(extrapolation_value_), + cubic_coeff_a_, exclude_outside_, + coordinate_transform_mode_, nearest_mode_, + dims_mapping); + } } else { TArray scales_div(rank); @@ -124,7 +312,7 @@ Status Upsample::ComputeInternal(OpKernelContext* context) const { auto input_dims = X->Shape().GetDims(); TensorShapeVector output_dims(input_dims.size()); - std::vector roi_array(input_dims.size() * 2, 0.0f); + InlinedVector roi_array(input_dims.size() * 2, 0.0f); if (!roi_cached_) { bool use_default_roi = true; if (need_roi_input_) { @@ -147,29 +335,37 @@ Status Upsample::ComputeInternal(OpKernelContext* context) const { } } - const std::vector& roi = roi_cached_ ? 
roi_ : roi_array; - std::vector scales_array = scales_; + ComputeROIWithAxes(roi_array, input_dims.size()); + InlinedVector scales_array(input_dims.size()); + // opset < 10 if (OpKernel::Node().InputDefs().size() == 1) { - // Compute output shape from scales and input dims + // Compute output shape from scales attributes and input dims + scales_array = scales_; + ComputeOutputShape(scales_array, input_dims, output_dims); - return BaseCompute(context, roi, scales_, output_dims); + return BaseCompute(context, roi_array, scales_, output_dims); } const Tensor* scales = context->Input(scales_input_idx_); const Tensor* sizes = context->Input(sizes_input_idx_); + // This is when scales are obtained and cached from a constant initializer if (scales_cached_) { - ORT_ENFORCE(sizes == nullptr, "Only one of scales or sizes must be provided as input."); + ORT_RETURN_IF_NOT(sizes == nullptr, "Only one of scales or sizes must be provided as input."); + scales_array = scales_; + // Compute output shape from scales and input dims ComputeOutputShape(scales_array, input_dims, output_dims); - return BaseCompute(context, roi, scales_, output_dims); + return BaseCompute(context, roi_array, scales_array, output_dims); } - scales_array.resize((input_dims.size())); + // Scales and sizes are input to the node if (scales != nullptr && scales->Shape().Size() != 0) { // use scales input data ORT_ENFORCE(sizes == nullptr, "Only one of scales or sizes must be provided as input."); ORT_RETURN_IF_ERROR(ParseScalesData(scales, scales_array, input_dims.size())); + + // Compute output shape from scales and input dims ComputeOutputShape(scales_array, input_dims, output_dims); } else { // When sizes input is available directly populate it into the output_dims array. @@ -179,7 +375,7 @@ Status Upsample::ComputeInternal(OpKernelContext* context) const { ORT_RETURN_IF_ERROR(ParseScalesDataAndAdjustOutputSize(output_dims, input_dims, scales_array)); } - return BaseCompute(context, roi, scales_array, output_dims); + return BaseCompute(context, roi_array, scales_array, output_dims); } } // namespace cuda diff --git a/onnxruntime/core/providers/cuda/tensor/upsample.h b/onnxruntime/core/providers/cuda/tensor/upsample.h index 7bf2a23ede399..50597e0fba1b9 100644 --- a/onnxruntime/core/providers/cuda/tensor/upsample.h +++ b/onnxruntime/core/providers/cuda/tensor/upsample.h @@ -13,12 +13,14 @@ namespace cuda { template class Upsample : public UpsampleBase, public CudaKernel { public: - Upsample(const OpKernelInfo& info) : UpsampleBase(info), CudaKernel(info) { - } + explicit Upsample(const OpKernelInfo& info); Status ComputeInternal(OpKernelContext* context) const override; - Status BaseCompute(OpKernelContext* context, const std::vector& roi, const std::vector& scales, - const gsl::span& output_dims) const; + Status BaseCompute(OpKernelContext* context, gsl::span roi, gsl::span scales, + gsl::span output_dims) const; + + private: + IAllocatorUniquePtr shared_lookup_table_ondevice_; }; } // namespace cuda diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc index 3fd5423681b81..0265c06b9a938 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc @@ -1145,11 +1145,11 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, MLFloat16, ReduceSumSquare); class 
ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, int64_t, GatherND); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, Dropout); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, float, Resize); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, double, Resize); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, MLFloat16, Resize); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, int32_t, Resize); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, uint8_t, Resize); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 17, float, Resize); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 17, double, Resize); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 17, MLFloat16, Resize); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 17, int32_t, Resize); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 17, uint8_t, Resize); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 18, If); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 18, Loop); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, Flatten); @@ -1304,6 +1304,11 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 18, MLFloat16, Pad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 18, bool, Pad); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 18, ScatterElements); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 18, float, Resize); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 18, double, Resize); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 18, MLFloat16, Resize); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 18, int32_t, Resize); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 18, uint8_t, Resize); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 18, Split); // Opset 19 @@ -2081,11 +2086,16 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2240,6 +2250,16 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, // Opset 19 diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc 
b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index da17135878fe5..7b73ab36b3742 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -24,6 +24,7 @@ #include "core/providers/cpu/tensor/size.h" #include "core/providers/cpu/tensor/scatter_nd.h" #include "core/providers/cpu/tensor/unsqueeze.h" +#include "core/providers/cpu/tensor/upsamplebase.h" #include "core/providers/cpu/tensor/tile.h" #ifndef DISABLE_CONTRIB_OPS @@ -572,6 +573,11 @@ std::unique_ptr> EinsumTypedComputeProcessor template <> std::unique_ptr> EinsumTypedComputeProcessor::Create(OpKernelContext* context, AllocatorPtr allocator, concurrency::ThreadPool* tp, EinsumComputePreprocessor& einsum_compute_preprocessor, void* einsum_cuda_assets) { return g_host_cpu.EinsumTypedComputeProcessor_MLFloat16__Create(context, allocator, tp, einsum_compute_preprocessor, einsum_cuda_assets); } +void UpsampleBase::AdjustOutputSizeAsPolicy(TensorShapeVector& output_dims, gsl::span input_dims, + InlinedVector& scales) const { + g_host_cpu.UpsampleBase__AdjustOutputSizeAsPolicy(this, output_dims, input_dims, scales); +} + #ifndef DISABLE_CONTRIB_OPS namespace contrib { Status embed_layer_norm::CheckInputs(const OpKernelContext* context, bool quantizedVersion) { @@ -648,7 +654,6 @@ Status Sampling::SetupSubgraphExecutionInfo(const SessionState& session_state, c const SessionState& subgraph_session_state) { return g_host_cpu.Sampling__SetupSubgraphExecutionInfo(this, session_state, attribute_name, subgraph_session_state); } - } // namespace transformers #ifdef ENABLE_ATEN diff --git a/onnxruntime/core/providers/xnnpack/tensor/resize.cc b/onnxruntime/core/providers/xnnpack/tensor/resize.cc index 0c9e2e9fc17a2..09666c8039402 100644 --- a/onnxruntime/core/providers/xnnpack/tensor/resize.cc +++ b/onnxruntime/core/providers/xnnpack/tensor/resize.cc @@ -288,7 +288,7 @@ Status Resize::Compute(OpKernelContext* ctx) const { // Get scales data const auto* scales = ctx->Input(scales_input_idx_); - std::vector scales_array(X->Shape().GetDims().size()); + InlinedVector scales_array(X->Shape().GetDims().size()); if (scales != nullptr && scales->Shape().Size() != 0) { ORT_RETURN_IF_ERROR(ParseScalesData(scales, scales_array, output_shape.size())); diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc index 10f02349a24d5..1d31f3fdb4eb4 100644 --- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc @@ -11,7 +11,8 @@ namespace test { TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_tf_crop_and_resize) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 0.20000028610229492, which exceeds threshold"; + GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] " + << "is 0.20000028610229492, which exceeds threshold"; } OpTester test("Resize", 13); @@ -32,7 +33,8 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_tf_crop_and_resize) { test.AddInput("X", {H, W}, X); test.AddInput("roi", {4}, roi); - test.AddInput("", {0}, scales); // opset13 requires either 'sizes' or 'scales' must be provided, but not both of them + // opset13 requires either 'sizes' or 'scales' must be provided, but not both of them + 
test.AddInput("", {0}, scales); test.AddInput("sizes", {2}, sizes); std::vector Y = {7.600004f, 7.9f, 8.2f, @@ -188,7 +190,9 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_without_e // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch // DML: results mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kDmlExecutionProvider}); + test.Run( + OpTester::ExpectResult::kExpectSuccess, "", + {kCudaExecutionProvider, kRocmExecutionProvider, kDmlExecutionProvider}); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_without_extrapolation_int8) { @@ -317,7 +321,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_int8) { // The output size is [1,1,2,4].*[1,1,0.6,0.6]=[1,1,1,2] // NNAPI will recaluclate the scales as the output size divided by input size // scales = [1,1,1,2]./[1,1,2,4] = [1,1,0.5,0.5] -// See, https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/reference/reference_ops.h +// See:https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/reference/reference_ops.h // So the result of the above example will be different than CPU EP // Add the following 2 tests to test with scales valid to NNAPI TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear1) { @@ -475,7 +479,8 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_align_corners_int TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_2DBilinear_pytorch_half_pixel) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 1.5000001192092896, which exceeds threshold"; + GTEST_SKIP() << "Skipping because of the following error: " + << " The difference between expected[i] and output[i] is 1.5000001192092896, which exceeds threshold"; } OpTester test("Resize", 13); @@ -533,7 +538,8 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_pytorch_half_pixe // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch // DML: results mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kDmlExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaExecutionProvider, kRocmExecutionProvider, kDmlExecutionProvider}); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_pytorch_half_pixel_int8) { @@ -721,7 +727,8 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_2DBilinear_align_corners) { TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_3DTrilinear_pytorch_half_pixel) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 1.5000001192092896, which exceeds threshold"; + GTEST_SKIP() << "Skipping because of the following error: " + << "The difference between expected[i] and output[i] is 1.5000001192092896, which exceeds threshold"; } OpTester test("Resize", 13); @@ -1088,7 +1095,8 @@ TEST(ResizeOpTest, ResizeOpNearestUpSample_Floor_Align_Corners) { TEST(ResizeOpTest, ResizeOpNearest_OneToOneMappingBetweenInputAndOutputDataDims) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: The difference 
between expected[i] and output[i] is 3, which exceeds threshold"; + GTEST_SKIP() << "Skipping because of the following error: " + << "The difference between expected[i] and output[i] is 3, which exceeds threshold"; } OpTester test("Resize", 12); // tf_half_pixel_for_nn is deprecated since opset 13 @@ -1480,7 +1488,8 @@ TEST(ResizeOpTest, ResizeOpCubicUpSampleTest_tf_half_pixel_for_nn) { TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_Ver10) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 1.6666665077209473, which exceeds threshold"; + GTEST_SKIP() << "Skipping because of the following error: " + << "The difference between expected[i] and output[i] is 1.6666665077209473, which exceeds threshold"; } OpTester test("Resize", 10); @@ -1505,7 +1514,8 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_Ver10) { TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_2DBilinear_Ver10) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 1.6666665077209473, which exceeds threshold"; + GTEST_SKIP() << "Skipping because of the following error: " + << "The difference between expected[i] and output[i] is 1.6666665077209473, which exceeds threshold "; } OpTester test("Resize", 10); @@ -1530,7 +1540,8 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_2DBilinear_Ver10) { TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_Ver10) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 0.5, which exceeds threshold"; + GTEST_SKIP() << "Skipping because of the following error: " + << "The difference between expected[i] and output[i] is 0.5, which exceeds threshold"; } OpTester test("Resize", 10); @@ -1565,7 +1576,8 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_Ver10) { TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_2DBilinear_Ver10) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 0.5, which exceeds threshold"; + GTEST_SKIP() << "Skipping because of the following error: " + << "The difference between expected[i] and output[i] is 0.5, which exceeds threshold"; } OpTester test("Resize", 10); @@ -1676,7 +1688,8 @@ TEST(UpsampleOpTest, ResizeOpNearestNoScaleTest_Ver10) { TEST(ResizeOpTest, ResizeOp_MissingRoiAndMissingScalesOptionalInputs) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: MLOperatorAuthorImpl.cpp(1876): The parameter is incorrect."; + GTEST_SKIP() << "Skipping because of the following error: " + << "MLOperatorAuthorImpl.cpp(1876): The parameter is incorrect."; } OpTester test("Resize", 13); @@ -1827,7 +1840,8 @@ template void TestAntialiasing(std::map attributes, std::vector input_shape, std::vector input_data, - std::vector output_shape_or_scale, std::vector output_data) { + std::vector output_shape_or_scale, std::vector output_data, + gsl::span excluded_ep = {}) { auto parse_attr = [](const std::string& str, auto typed_v) { using Tdata = decltype(typed_v); 
std::vector vect; @@ -1891,13 +1905,22 @@ void TestAntialiasing(std::map attributes, } test.AddOutput("Y", output_shape, output_data); - // TensorRT 8.5 supports operators up to Opset 17. Temporarily exclude TensorRT EP due to accurarcy issue. - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + + std::unordered_set excluded_eps; + std::transform(excluded_ep.begin(), excluded_ep.end(), + std::inserter(excluded_eps, excluded_eps.end()), [](std::string_view ep) { + return std::string(ep); + }); + // TensorRT 8.5 supports operators up to Opset 17. Temporarily exclude TensorRT EP due to accuracy issue. + excluded_eps.insert(kTensorrtExecutionProvider); + + test.Run(OpTester::ExpectResult::kExpectSuccess, "", excluded_eps); } TEST(ResizeOpTest, Antialias_Bilinear_No_ExcludeOutside) { if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because dml implementation of antialias is slightly different and doesn't match in all cases."; + GTEST_SKIP() << "Skipping because dml implementation of antialias " + << "is slightly different and doesn't match in all cases."; } std::vector X(16); std::iota(X.begin(), X.end(), 1.f); @@ -1939,7 +1962,8 @@ TEST(ResizeOpTest, Antialias_Bilinear_dtype) { std::vector Y = {1, 3, 4, 6, 8, 9, 11, 13, 14}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 1, 4, 4}, X, {1, 1, 3, 3}, Y); + InlinedVector excluded_eps = {kCudaExecutionProvider}; + TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 1, 4, 4}, X, {1, 1, 3, 3}, Y, excluded_eps); } { std::vector X(16); @@ -1982,17 +2006,21 @@ TEST(ResizeOpTest, Antialias_NhwcBilinear) { 33.5f, 73.5f, 113.5f, 35.074074f, 75.07407f, 115.07407f, 36.590908f, 76.59091f, 116.59091f}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 5, 8, 3}, X, {1, 4, 5, 3}, Y); + + // Nchw is not supported by CUDA Resize implementation + InlinedVector excluded_eps = {kCudaExecutionProvider, kRocmExecutionProvider}; + TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 5, 8, 3}, X, {1, 4, 5, 3}, Y, excluded_eps); } TEST(ResizeOpTest, Antialias_NhwcBilinear_dtype) { + InlinedVector excluded_eps = {kCudaExecutionProvider, kRocmExecutionProvider}; { std::vector X(16); std::iota(X.begin(), X.end(), uint8_t(0)); std::vector Y = {1, 3, 4, 6, 8, 9, 11, 13, 14}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 4, 4, 1}, X, {1, 3, 3, 1}, Y); + TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 4, 4, 1}, X, {1, 3, 3, 1}, Y, excluded_eps); } { std::vector X(16); @@ -2000,7 +2028,7 @@ TEST(ResizeOpTest, Antialias_NhwcBilinear_dtype) { std::vector Y = {1, 3, 4, 6, 8, 9, 11, 13, 14}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 4, 4, 1}, X, {1, 3, 3, 1}, Y); + TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 4, 4, 1}, X, {1, 3, 3, 1}, Y, excluded_eps); } { std::vector X(16); @@ -2008,13 +2036,14 @@ TEST(ResizeOpTest, Antialias_NhwcBilinear_dtype) { std::vector Y = {1, 3, 4, 6, 8, 9, 11, 13, 14}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 4, 4, 1}, X, {1, 3, 3, 1}, Y); + TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 4, 4, 1}, X, {1, 3, 3, 1}, Y, excluded_eps); } } TEST(ResizeOpTest, Antialias_Trilinear_No_ExcludeOutside) { if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because dml implementation of antialias is slightly different and doesn't match in all 
cases."; + GTEST_SKIP() << "Skipping because dml implementation of " + << "antialias is slightly different and doesn't match in all cases."; } std::vector X(16 * 4); std::iota(X.begin(), X.end(), 0.f); @@ -2038,13 +2067,17 @@ TEST(ResizeOpTest, Antialias_Trilinear_ExcludeOutside) { TEST(ResizeOpTest, Antialias_Trilinear_Scale_Is_11s_and_1s1) { if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because dml implementation of antialias is slightly different and doesn't match in all cases."; + GTEST_SKIP() << "Skipping because dml implementation of antialias" + << " is slightly different and doesn't match in all cases."; } + + InlinedVector excluded_eps = {kCudaExecutionProvider}; std::vector X(16 * 4 * 4); std::iota(X.begin(), X.end(), 0.f); { std::vector Y = X; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {4, 1, 4, 4, 4}, X, {4, 1, 4, 4, 4}, Y); + TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {4, 1, 4, 4, 4}, X, {4, 1, 4, 4, 4}, Y, + excluded_eps); } { std::vector Y = {0.625f, 2.375f, 4.625f, 6.375f, 8.625f, 10.375f, 12.625f, @@ -2066,7 +2099,8 @@ TEST(ResizeOpTest, Antialias_Trilinear_Scale_Is_11s_and_1s1) { 224.625f, 226.375f, 228.625f, 230.375f, 232.625f, 234.375f, 236.625f, 238.375f, 240.625f, 242.375f, 244.625f, 246.375f, 248.625f, 250.375f, 252.625f, 254.375f}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "0"}}, {4, 1, 4, 4, 4}, X, {4, 1, 4, 4, 2}, Y); + TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "0"}}, {4, 1, 4, 4, 4}, X, {4, 1, 4, 4, 2}, Y, + excluded_eps); } { std::vector Y = {2.5f, 3.5f, 4.5f, 5.5f, 9.5f, 10.5f, 11.5f, 12.5f, 18.5f, @@ -2084,7 +2118,8 @@ TEST(ResizeOpTest, Antialias_Trilinear_Scale_Is_11s_and_1s1) { 217.5f, 218.5f, 219.5f, 220.5f, 226.5f, 227.5f, 228.5f, 229.5f, 233.5f, 234.5f, 235.5f, 236.5f, 242.5f, 243.5f, 244.5f, 245.5f, 249.5f, 250.5f, 251.5f, 252.5f}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "0"}}, {4, 1, 4, 4, 4}, X, {4, 1, 4, 2, 4}, Y); + TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "0"}}, {4, 1, 4, 4, 4}, X, {4, 1, 4, 2, 4}, Y, + excluded_eps); } } @@ -2124,12 +2159,15 @@ TEST(ResizeOpTest, Antialias_NHWCBicubic_ExcludeOutside) { 19.576872f, 43.57687f, 21.126253f, 45.126255f, 22.606192f, 46.606194f, 19.878183f, 43.87818f, 21.358122f, 45.35812f, 22.907503f, 46.907505f, 24.387442f, 48.387444f}; - TestAntialiasing({{"mode", "cubic"}, {"exclude_outside", "0"}}, {1, 4, 6, 2}, X, {1, 8, 4, 2}, Y); + + InlinedVector excluded_eps = {kCudaExecutionProvider, kRocmExecutionProvider}; + TestAntialiasing({{"mode", "cubic"}, {"exclude_outside", "0"}}, {1, 4, 6, 2}, X, {1, 8, 4, 2}, Y, excluded_eps); } TEST(ResizeOpTest, Antialias_Linear_AlignCorners) { if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because dml implementation of antialias is slightly different and doesn't match in all cases."; + GTEST_SKIP() << "Skipping because dml implementation of antialias" + << "is slightly different and doesn't match in all cases."; } std::vector X(256); std::iota(X.begin(), X.end(), 0.0f); @@ -2145,9 +2183,40 @@ TEST(ResizeOpTest, Antialias_Linear_AlignCorners) { 187.08333f, 195.91667f, 198.41667f, 205.91667f, 208.41667f, 217.25f, 219.75f, 227.25f, 229.75f, 238.58333f, 241.08333f, 248.58333f, 251.08333f}; + InlinedVector excluded_eps = {kCudaExecutionProvider, kRocmExecutionProvider}; TestAntialiasing( {{"mode", "linear"}, {"exclude_outside", "0"}, {"coordinate_transformation_mode", "align_corners"}}, - {4, 1, 4, 
4, 4}, X, {4, 1, 3, 2, 2}, Y); + {4, 1, 4, 4, 4}, X, {4, 1, 3, 2, 2}, Y, excluded_eps); +} + +TEST(ResizeOpTest, Antialias_Linear_AlignCorners_3D) { + if (DefaultDmlExecutionProvider().get() != nullptr) { + GTEST_SKIP() << "Skipping because dml implementation of antialias is slightly " + << "different and doesn't match in all cases."; + } + std::vector X(256); + std::iota(X.begin(), X.end(), 0.0f); + std::vector Y{ + 1.25f, 3.75f, 11.25f, 13.75f, + 17.25f, 19.75f, 27.25f, 29.75f, + 33.25f, 35.75f, 43.25f, 45.75f, + 49.25f, 51.75f, 59.25f, 61.75f, + 65.25f, 67.75f, 75.25f, 77.75f, + 81.25f, 83.75f, 91.25f, 93.75f, + 97.25f, 99.75f, 107.25f, 109.75f, + 113.25f, 115.75f, 123.25f, 125.75f, + 129.25f, 131.75f, 139.25f, 141.75f, + 145.25f, 147.75f, 155.25f, 157.75f, + 161.25f, 163.75f, 171.25f, 173.75f, + 177.25f, 179.75f, 187.25f, 189.75f, + 193.25f, 195.75f, 203.25f, 205.75f, + 209.25f, 211.75f, 219.25f, 221.75f, + 225.25f, 227.75f, 235.25f, 237.75f, + 241.25f, 243.75f, 251.25f, 253.75f}; + + TestAntialiasing( + {{"mode", "linear"}, {"exclude_outside", "0"}, {"coordinate_transformation_mode", "align_corners"}}, + {16, 4, 4}, X, {16, 2, 2}, Y); } TEST(ResizeOpTest, Antialias_Bicubic_ExcludeOutside) { @@ -2166,19 +2235,23 @@ TEST(ResizeOpTest, Antialias_Bicubic_Dtype) { std::vector X(36); std::iota(X.begin(), X.end(), uint8_t(0)); std::vector Y = {4, 6, 7, 16, 18, 19, 28, 30, 31}; - TestAntialiasing({{"mode", "cubic"}, {"cubic_coeff_a", "-0.5f"}, {"exclude_outside", "1"}}, {1, 1, 6, 6}, X, {1, 1, 3, 3}, Y); + TestAntialiasing({{"mode", "cubic"}, {"cubic_coeff_a", "-0.5f"}, {"exclude_outside", "1"}}, {1, 1, 6, 6}, + X, {1, 1, 3, 3}, Y); } { std::vector X(36); std::iota(X.begin(), X.end(), int8_t(0)); std::vector Y = {4, 6, 7, 16, 18, 19, 28, 30, 31}; - TestAntialiasing({{"mode", "cubic"}, {"cubic_coeff_a", "-0.5f"}, {"exclude_outside", "1"}}, {1, 1, 6, 6}, X, {1, 1, 3, 3}, Y); + InlinedVector excluded_eps = {kCudaExecutionProvider}; + TestAntialiasing({{"mode", "cubic"}, {"cubic_coeff_a", "-0.5f"}, {"exclude_outside", "1"}}, {1, 1, 6, 6}, + X, {1, 1, 3, 3}, Y, excluded_eps); } { std::vector X(36); std::iota(X.begin(), X.end(), 0); std::vector Y = {4, 6, 7, 16, 18, 19, 28, 30, 31}; - TestAntialiasing({{"mode", "cubic"}, {"cubic_coeff_a", "-0.5f"}, {"exclude_outside", "1"}}, {1, 1, 6, 6}, X, {1, 1, 3, 3}, Y); + TestAntialiasing({{"mode", "cubic"}, {"cubic_coeff_a", "-0.5f"}, {"exclude_outside", "1"}}, {1, 1, 6, 6}, + X, {1, 1, 3, 3}, Y); } } @@ -2189,8 +2262,10 @@ TEST(ResizeOpTest, Antialias_Axes_and_Scale) { std::vector Y = {6.3f, 7.5f, 8.7f, 11.1f, 12.3f, 13.5f, 15.9f, 17.1f, 18.3f, 25.5f, 26.7f, 27.9f, 30.3f, 31.5f, 32.7f, 35.1f, 36.3f, 37.5f, 44.7f, 45.9f, 47.1f, 49.5f, 50.7f, 51.9f, 54.3f, 55.5f, 56.7f}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}, {"axes", "{2,3,4}"}, {"output_shape", "{1,1,3,3,3}"}}, {1, 1, 4, 4, 4}, X, - std::vector{3 / 4.0f, 3 / 4.0f, 3 / 4.0f}, Y); + TestAntialiasing( + {{"mode", "linear"}, {"exclude_outside", "1"}, {"axes", "{2,3,4}"}, {"output_shape", "{1,1,3,3,3}"}}, + {1, 1, 4, 4, 4}, X, + std::vector{3 / 4.0f, 3 / 4.0f, 3 / 4.0f}, Y); } TEST(ResizeOpTest, Antialias_Axes_and_Size) { @@ -2199,8 +2274,10 @@ TEST(ResizeOpTest, Antialias_Axes_and_Size) { std::vector Y = {6.3f, 7.5f, 8.7f, 11.1f, 12.3f, 13.5f, 15.9f, 17.1f, 18.3f, 25.5f, 26.7f, 27.9f, 30.3f, 31.5f, 32.7f, 35.1f, 36.3f, 37.5f, 44.7f, 45.9f, 47.1f, 49.5f, 50.7f, 51.9f, 54.3f, 55.5f, 56.7f}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}, {"axes", "{2,3,4}"}, 
{"output_shape", "{1,1,3,3,3}"}}, {1, 1, 4, 4, 4}, X, - {3, 3, 3}, Y); + TestAntialiasing( + {{"mode", "linear"}, {"exclude_outside", "1"}, {"axes", "{2,3,4}"}, {"output_shape", "{1,1,3,3,3}"}}, + {1, 1, 4, 4, 4}, X, + {3, 3, 3}, Y); } TEST(ResizeOpTest, Antialias_Axes_and_PolicyNoLarger) { @@ -2209,9 +2286,13 @@ TEST(ResizeOpTest, Antialias_Axes_and_PolicyNoLarger) { std::vector Y = {6.3f, 7.5f, 8.7f, 11.1f, 12.3f, 13.5f, 15.9f, 17.1f, 18.3f, 25.5f, 26.7f, 27.9f, 30.3f, 31.5f, 32.7f, 35.1f, 36.3f, 37.5f, 44.7f, 45.9f, 47.1f, 49.5f, 50.7f, 51.9f, 54.3f, 55.5f, 56.7f}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}, {"axes", "{2,3,4}"}, {"output_shape", "{1,1,3,3,3}"}, {"policy", "not_larger"}}, - {1, 1, 4, 4, 4}, X, - {3, 4, 5}, Y); + // clang-format off + TestAntialiasing( + {{"mode", "linear"}, {"exclude_outside", "1"}, {"axes", "{2,3,4}"}, {"output_shape", "{1,1,3,3,3}"}, + {"policy", "not_larger"}}, + {1, 1, 4, 4, 4}, X, + {3, 4, 5}, Y); + // clang-format on } TEST(ResizeOpTest, Antialias_Axes_and_PolicyNoSmaller) { @@ -2220,9 +2301,13 @@ TEST(ResizeOpTest, Antialias_Axes_and_PolicyNoSmaller) { std::vector Y = {6.3f, 7.5f, 8.7f, 11.1f, 12.3f, 13.5f, 15.9f, 17.1f, 18.3f, 25.5f, 26.7f, 27.9f, 30.3f, 31.5f, 32.7f, 35.1f, 36.3f, 37.5f, 44.7f, 45.9f, 47.1f, 49.5f, 50.7f, 51.9f, 54.3f, 55.5f, 56.7f}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}, {"axes", "{2,3,4}"}, {"output_shape", "{1,1,3,3,3}"}, {"policy", "not_smaller"}}, - {1, 1, 4, 4, 4}, X, - {1, 2, 3}, Y); + // clang-format off + TestAntialiasing( + {{"mode", "linear"}, {"exclude_outside", "1"}, {"axes", "{2,3,4}"}, {"output_shape", "{1,1,3,3,3}"}, + {"policy", "not_smaller"}}, + {1, 1, 4, 4, 4}, X, + {1, 2, 3}, Y); + // clang-format on } TEST(ResizeOpTest, Antialias_Use_Extrapolation) { From 2a857d9a86ca3049829256df3347521069ccd6b4 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Fri, 1 Mar 2024 10:23:29 +1000 Subject: [PATCH 091/279] Add ML Program support for more operators (#19527) ### Description Add support for: - Clip/Relu/Relu6 - Add/Mul/Div/Sub/Pow - GlobalAveragePool/GlobalMaxPool/AveragePool/MaxPool - Reshape - Gemm/MatMul Fix some build issues/warnings from changes. Fix a couple of potential issues with the Resize op as well (noticed due to change to reject inputs with empty data at a higher level). 
### Motivation and Context Enable mobilenetv2 with ML Program --- cmake/onnxruntime_providers_coreml.cmake | 2 +- .../providers/coreml/builders/coreml_spec.h | 7 +- .../core/providers/coreml/builders/helper.cc | 14 +- .../coreml/builders/impl/base_op_builder.cc | 13 +- .../coreml/builders/impl/base_op_builder.h | 6 +- .../coreml/builders/impl/binary_op_builder.cc | 113 +++--- .../coreml/builders/impl/builder_utils.cc | 68 ++++ .../coreml/builders/impl/builder_utils.h | 17 +- .../coreml/builders/impl/clip_op_builder.cc | 187 ++++++--- .../coreml/builders/impl/conv_op_builder.cc | 94 +---- .../coreml/builders/impl/gemm_op_builder.cc | 332 +++++++++++----- .../coreml/builders/impl/pool_op_builder.cc | 218 +++++++---- .../builders/impl/reshape_op_builder.cc | 70 ++-- .../coreml/builders/impl/resize_op_builder.cc | 16 +- .../coreml/builders/impl/slice_op_builder.cc | 2 +- .../builders/impl/softmax_op_builder.cc | 4 +- .../coreml/builders/model_builder.cc | 366 +++++++++++++----- .../providers/coreml/builders/model_builder.h | 63 +-- .../coreml/coreml_execution_provider.cc | 82 ++-- .../providers/coreml/dump_mlprogram_model.py | 27 ++ .../core/providers/coreml/model/host_utils.h | 6 + .../core/providers/coreml/model/host_utils.mm | 10 + .../core/providers/coreml/model/model.h | 19 +- .../core/providers/coreml/model/model.mm | 13 + .../core/providers/coreml/model/model_stub.cc | 4 + .../providers/cpu/tensor/reshape_helper.h | 6 +- .../test/perftest/command_args_parser.cc | 25 +- onnxruntime/test/perftest/ort_test_session.cc | 30 +- .../providers/coreml/coreml_basic_test.cc | 20 + .../test/providers/cpu/math/clip_test.cc | 27 +- .../test/providers/cpu/math/gemm_test.cc | 37 +- .../providers/cpu/nn/batch_norm_op_test.cc | 37 ++ .../providers/cpu/tensor/resize_op_test.cc | 4 +- 33 files changed, 1344 insertions(+), 595 deletions(-) create mode 100644 onnxruntime/core/providers/coreml/dump_mlprogram_model.py diff --git a/cmake/onnxruntime_providers_coreml.cmake b/cmake/onnxruntime_providers_coreml.cmake index c9f35e5337f9b..8f3b1828e1c61 100644 --- a/cmake/onnxruntime_providers_coreml.cmake +++ b/cmake/onnxruntime_providers_coreml.cmake @@ -111,7 +111,7 @@ if(_enable_ML_PROGRAM) file(GLOB onnxruntime_providers_coreml_modelpackage_cc_srcs CONFIGURE_DEPENDS "${coremltools_SOURCE_DIR}/modelpackage/src/ModelPackage.?pp" - "${coremltools_SOURCE_DIR}/modelpackage/src/Utils/JsonMap.?pp" + "${coremltools_SOURCE_DIR}/modelpackage/src/utils/JsonMap.?pp" ) set(coremltools_srcs diff --git a/onnxruntime/core/providers/coreml/builders/coreml_spec.h b/onnxruntime/core/providers/coreml/builders/coreml_spec.h index c9adba9e579d0..9448f1167990e 100644 --- a/onnxruntime/core/providers/coreml/builders/coreml_spec.h +++ b/onnxruntime/core/providers/coreml/builders/coreml_spec.h @@ -17,14 +17,19 @@ #ifdef HAS_SHORTEN_64_TO_32 #pragma GCC diagnostic ignored "-Wshorten-64-to-32" #endif +#elif defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4244) // conversion from long to int #endif // Model.pb.h is generated in the build output directory from the CoreML protobuf files in -// onnxruntime/core/providers/coreml/coremltools/mlmodel/format +// /_deps/coremltools-src/mlmodel/format #include "coreml_proto/Model.pb.h" #if defined(__GNUC__) #pragma GCC diagnostic pop +#elif defined(_MSC_VER) +#pragma warning(pop) #endif namespace COREML_SPEC = CoreML::Specification; diff --git a/onnxruntime/core/providers/coreml/builders/helper.cc b/onnxruntime/core/providers/coreml/builders/helper.cc index 
bc3ba4432e66d..b8ebbd05a2a20 100644 --- a/onnxruntime/core/providers/coreml/builders/helper.cc +++ b/onnxruntime/core/providers/coreml/builders/helper.cc @@ -85,9 +85,15 @@ bool IsInputSupported(const Node& node, const NodeArg& input, } if (dim == 0) { - LOGS(logger, WARNING) << "CoreML does not support shapes with dimension values of 0. Input:" << input_name - << ", shape: " << Shape2String(shape); - return false; + if (node.OpType() == "Resize" && &input == node.InputDefs()[1]) { + // one special case. Resize 'roi' input was originally a required input but is rarely used. + // ROI is not supported in the CoreML implementation so we will ignore the value, but is often added + // (at least in the unit tests) as an initializer with shape {0}. + } else { + LOGS(logger, WARNING) << "CoreML does not support shapes with dimension values of 0. Input:" << input_name + << ", shape: " << Shape2String(shape); + return false; + } } } @@ -125,7 +131,7 @@ std::unordered_set GetSupportedNodes(const GraphViewer& graph_viewe bool CheckIsConstantInitializer(const NodeArg& node_arg, const GraphViewer& graph_viewer, const logging::Logger& logger, std::string_view input_description) { - if (graph_viewer.GetConstantInitializer(node_arg.Name(), true) == nullptr) { + if (graph_viewer.GetConstantInitializer(node_arg.Name()) == nullptr) { LOGS(logger, VERBOSE) << input_description << " (NodeArg name: '" << node_arg.Name() << "') is not a constant initializer tensor"; return false; diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc index 2570e6d88ae0d..83a572f4b60fa 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc @@ -83,9 +83,14 @@ bool BaseOpBuilder::HasSupportedInputs(const Node& node, const OpBuilderInputPar } /* static */ -bool BaseOpBuilder::IsInput0Supported(const Node& node, const OpBuilderInputParams& /*input_params*/, - const logging::Logger& logger) { - const auto& input = *node.InputDefs()[0]; +bool BaseOpBuilder::IsInputFloat(const Node& node, size_t idx, const OpBuilderInputParams& /*input_params*/, + const logging::Logger& logger) { + if (idx >= node.InputDefs().size()) { + LOGS(logger, VERBOSE) << "Input index [" << idx << "] is out of range"; + return false; + } + + const auto& input = *node.InputDefs()[idx]; int32_t input_type = ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED; @@ -102,7 +107,7 @@ bool BaseOpBuilder::HasSupportedInputsImpl(const Node& node, const OpBuilderInpu const logging::Logger& logger) const { // We only check the type of input 0 by default // specific op builder can override this - return IsInput0Supported(node, input_params, logger); + return IsInputFloat(node, 0, input_params, logger); } bool BaseOpBuilder::HasSupportedOpSet(const Node& node, const logging::Logger& logger) const { diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h index 06c4dd94ea30d..63f0b813d654c 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h +++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h @@ -28,9 +28,9 @@ class BaseOpBuilder : public IOpBuilder { void AddInitializersToSkip(ModelBuilder& /*model_builder*/, const Node& /*node*/) const override {} protected: - // check if the first input's data type is supported. 
- static bool IsInput0Supported(const Node& node, const OpBuilderInputParams& input_params, - const logging::Logger& logger); + // currently we only support float + static bool IsInputFloat(const Node& node, size_t idx, const OpBuilderInputParams& input_params, + const logging::Logger& logger); private: virtual bool IsOpSupportedImpl(const Node& /*node*/, const OpBuilderInputParams& /*input_params*/, diff --git a/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc index 6074fba1433d9..fb8e07633621f 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc @@ -5,6 +5,7 @@ #include "core/providers/common.h" #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" #include "core/providers/coreml/builders/model_builder.h" #include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/shared/utils/utils.h" @@ -19,6 +20,8 @@ class BinaryOpBuilder : public BaseOpBuilder { bool HasSupportedInputsImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } }; namespace { @@ -57,38 +60,72 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const const auto& op_type(node.OpType()); const auto& input_defs(node.InputDefs()); - std::unique_ptr layer = model_builder.CreateNNLayer(node); - - if (op_type == "Add") { - // original mutable_add() has limited broadcasting support - // updated to use CoreML::AddBroadcastableLayerParams which has more general broadcasting support - if (CheckIfBothInputShapesMatch(node, logger)) { - layer->mutable_add(); +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; + + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#module-coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_binary + std::string_view coreml_op_type; + if (op_type == "Add") { + coreml_op_type = "add"; + } else if (op_type == "Mul") { + coreml_op_type = "mul"; + } else if (op_type == "Sub") { + coreml_op_type = "sub"; + } else if (op_type == "Div") { + // we only support fp32 currently. 
when we add support for integers we need to check the type and use + // "floor_div" or "real_div" accordingly + coreml_op_type = "real_div"; + } else if (op_type == "Pow") { + coreml_op_type = "pow"; } else { - layer->mutable_addbroadcastable(); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "BinaryOpBuilder::AddToModelBuilderImpl, unexpected op: ", op_type); } - } else if (op_type == "Mul") { - if (CheckIfBothInputShapesMatch(node, logger)) { - layer->mutable_multiply(); + + std::unique_ptr op = model_builder.CreateOperation(node, coreml_op_type); + AddOperationInput(*op, "x", input_defs[0]->Name()); + AddOperationInput(*op, "y", input_defs[1]->Name()); + AddOperationOutput(*op, *node.OutputDefs()[0]); + + model_builder.AddOperation(std::move(op)); + } else +#endif // defined (COREML_ENABLE_MLPROGRAM) + { + std::unique_ptr layer = model_builder.CreateNNLayer(node); + + if (op_type == "Add") { + // original mutable_add() has limited broadcasting support + // updated to use CoreML::AddBroadcastableLayerParams which has more general broadcasting support + if (CheckIfBothInputShapesMatch(node, logger)) { + layer->mutable_add(); + } else { + layer->mutable_addbroadcastable(); + } + } else if (op_type == "Mul") { + if (CheckIfBothInputShapesMatch(node, logger)) { + layer->mutable_multiply(); + } else { + layer->mutable_multiplybroadcastable(); + } + } else if (op_type == "Sub") { + layer->mutable_subtractbroadcastable(); + } else if (op_type == "Div") { + layer->mutable_dividebroadcastable(); + } else if (op_type == "Pow") { + layer->mutable_powbroadcastable(); } else { - layer->mutable_multiplybroadcastable(); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "BinaryOpBuilder::AddToModelBuilderImpl, unexpected op: ", op_type); } - } else if (op_type == "Sub") { - layer->mutable_subtractbroadcastable(); - } else if (op_type == "Div") { - layer->mutable_dividebroadcastable(); - } else if (op_type == "Pow") { - layer->mutable_powbroadcastable(); - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "BinaryOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); - } - *layer->mutable_input()->Add() = input_defs[0]->Name(); - *layer->mutable_input()->Add() = input_defs[1]->Name(); - *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + *layer->mutable_input()->Add() = input_defs[0]->Name(); + *layer->mutable_input()->Add() = input_defs[1]->Name(); + *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + + model_builder.AddLayer(std::move(layer)); + } - model_builder.AddLayer(std::move(layer)); return Status::OK(); } @@ -99,25 +136,11 @@ int BinaryOpBuilder::GetMinSupportedOpSet(const Node& /* node */) const { bool BinaryOpBuilder::HasSupportedInputsImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const { - if (node.OpType() != "Pow") { - return IsInput0Supported(node, input_params, logger); - } - - const auto& input_1 = *node.InputDefs()[0]; - const auto& input_2 = *node.InputDefs()[1]; - - // Pow we only support both inputs as fp32 for now - int32_t input_type_1; - int32_t input_type_2; - if (!GetType(input_1, input_type_1, logger) || - !GetType(input_2, input_type_2, logger)) { - return false; - } - - if (input_type_1 != ONNX_NAMESPACE::TensorProto_DataType_FLOAT || input_type_1 != input_type_2) { - LOGS(logger, VERBOSE) << "Pow only supports fp32 inputs, actual input type" - << ", Input type 1: " << input_type_1 - << ", Input type 2: " << input_type_2; + // Add/Sub/Mul/Div spec says inputs 
must be of the same type. + // Pow spec says inputs can be different types. + // We only support float for all of these inputs. + if (!IsInputFloat(node, 0, input_params, logger) || + ((node.OpType() == "Pow") && !IsInputFloat(node, 1, input_params, logger))) { return false; } diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc index 710f596b2a562..cbea969904ed5 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc @@ -7,6 +7,7 @@ #include "core/framework/tensorprotoutils.h" #include "core/providers/coreml/builders/coreml_spec.h" #include "core/providers/coreml/builders/helper.h" +#include "core/providers/coreml/builders/model_builder.h" #include "core/providers/shared/utils/utils.h" #include "core/optimizer/initializer.h" @@ -132,6 +133,7 @@ void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::spansize(); + size_t num_dims = num_pads / 2; + std::vector reordered_pads(num_pads, 0); + for (size_t i = 0; i < num_pads; ++i) { + auto cur_dim = i % num_dims; + if (i < num_dims) { // start values + reordered_pads[cur_dim * 2] = (*onnx_pads)[i]; + } else { // end values + reordered_pads[cur_dim * 2 + 1] = (*onnx_pads)[i]; + } + } + + AddOperationInput(op, "pad", model_builder.AddConstant(op_type, "pad", reordered_pads)); + + break; + } + + // fall through if explicit pads were not provided as the default value for `pads` is all zeros, + // which is the same as 'valid' padding. + [[fallthrough]]; + } + case AutoPadType::VALID: + AddOperationInput(op, "pad_type", + model_builder.AddScalarConstant(op_type, "pad_type", std::string("valid"))); + + break; + case AutoPadType::SAME_UPPER: + case AutoPadType::SAME_LOWER: { + const auto pad_type = (auto_pad_type == AutoPadType::SAME_UPPER ? "same" : "same_lower"); + AddOperationInput(op, "pad_type", + model_builder.AddScalarConstant(op_type, "pad_type", std::string(pad_type))); + + // despite what the spec says, a 'pad' input seems to be required. + // https://github.com/apple/coremltools/issues/2127 + // Provide the default value as that's what coremltools does for conv/avg_pool/max_pool. 
+ std::vector ignored_pads(num_spatial_dims * 2, 0); + AddOperationInput(op, "pad", model_builder.AddConstant(op_type, "pad", ignored_pads)); + + break; + } + } +} +#endif // defined(COREML_ENABLE_MLPROGRAM) } // namespace coreml } // namespace onnxruntime diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h index 8126f0c126914..2804589065631 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h +++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h @@ -11,13 +11,15 @@ #include "core/common/status.h" #include "core/graph/basic_types.h" #include "core/providers/common.h" - #include "core/providers/coreml/builders/coreml_spec.h" +#include "core/providers/shared/utils/utils.h" namespace onnxruntime { class NodeArg; namespace coreml { +class ModelBuilder; + // Try to see if we can map explicit padding to auto padding for Conv/Pool // Since usually use auto padding is more efficient Status HandleAutoPad(const std::vector input_shape, @@ -45,6 +47,7 @@ void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span data); +#if defined(COREML_ENABLE_MLPROGRAM) // // MLProgram utils // @@ -130,5 +133,17 @@ void AddOperationInput(COREML_SPEC::MILSpec::Operation& op, /// Operation to update. /// NodeArg with details of output to add. void AddOperationOutput(COREML_SPEC::MILSpec::Operation& op, const NodeArg& output); + +/// +/// Add pad_type and pad values. +/// +/// Operator to update +/// ModelBuilder to add constants with. +/// Operator type. +/// Node attribute helper. +/// Number of spatial dims in input. Generally rank - 2 (ignore N and C dims). +void AddPadTypeAndPads(COREML_SPEC::MILSpec::Operation& op, ModelBuilder& model_builder, std::string_view op_type, + const NodeAttrHelper& helper, int num_spatial_dims); +#endif // defined(COREML_ENABLE_MLPROGRAM) } // namespace coreml } // namespace onnxruntime diff --git a/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc index 9aca172abec98..41f4041ef1181 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc @@ -2,6 +2,7 @@ // Licensed under the MIT License. 
#include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" #include "core/providers/coreml/builders/model_builder.h" #include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/shared/utils/utils.h" @@ -17,11 +18,31 @@ class ClipOpBuilder : public BaseOpBuilder { bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } }; void ClipOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const { + bool skip = true; + + if (model_builder.CreateMLProgram()) { + float min, max; + ORT_IGNORE_RETURN_VALUE(GetClipMinMax(model_builder.GetGraphViewer(), node, min, max, model_builder.Logger())); + + bool has_min = min != std::numeric_limits::lowest(); + bool has_max = max != std::numeric_limits::max(); + if (has_min && has_max && min == 0.f && max == 6.f) { + // relu6 - skip both + } else if (has_min && min == 0.f && !has_max) { + // relu - skip both + } else { + // clip - we will use both + skip = false; + } + } + // Both min and max values will be injected into the layer, no need to add to the model - if (node.SinceVersion() >= 11) { + if (skip && node.SinceVersion() >= 11) { if (node.InputDefs().size() > 1) model_builder.AddInitializerToSkip(node.InputDefs()[1]->Name()); @@ -35,72 +56,126 @@ Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const logging::Logger& logger) const { const auto& node_name = node.Name(); const auto& input_name = node.InputDefs()[0]->Name(); - const auto& output_name = node.OutputDefs()[0]->Name(); + const auto& output = *node.OutputDefs()[0]; + const auto& output_name = output.Name(); float min, max; ORT_RETURN_IF_NOT(GetClipMinMax(model_builder.GetGraphViewer(), node, min, max, logger), "GetClipMinMax failed"); bool has_min = min != std::numeric_limits::lowest(); bool has_max = max != std::numeric_limits::max(); - if (!has_min && !has_max) { - // Clip without min/max is an identity node - // In CoreML we don't have identity, use ActivationLinear instead - std::unique_ptr layer = model_builder.CreateNNLayer(node); - layer->mutable_activation()->mutable_linear()->set_alpha(1.0f); - *layer->mutable_input()->Add() = input_name; - *layer->mutable_output()->Add() = output_name; - - model_builder.AddLayer(std::move(layer)); - } else { - // The implementation of clip(min, max) is done by - // 1. Clipping at min -> max(input, min) is handled by - // min_output = threshold(input, min) - // 2. Clipping at max -> min(min_output, max) is handled by - // output = -1 * (threshold(-min_output, -max)) - - // Now we have at least one or min or max is not default value - // Clipping at max will need take the output of clipping at min, or the node input, if min value is default - // If max value is default, the output of clipping at min will be the output of the node - std::string min_output_name = output_name; - if (has_max) { - min_output_name = has_min - ? model_builder.GetUniqueName(node_name + "min_output") - : input_name; +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; + + std::unique_ptr op; + if (!has_min && !has_max) { + // Clip without min/max is an identity node. 
+ op = model_builder.CreateOperation(node, "identity"); + Operation& identity_op = *op; + AddOperationInput(identity_op, "x", input_name); + } else { + if (has_min && has_max && min == 0.f && max == 6.f) { + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.activation.relu6 + op = model_builder.CreateOperation(node, "relu6"); + Operation& relu6_op = *op; + AddOperationInput(relu6_op, "x", input_name); + } else if (has_min && min == 0.f && !has_max) { + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.activation.relu + op = model_builder.CreateOperation(node, "relu"); + Operation& relu_op = *op; + AddOperationInput(relu_op, "x", input_name); + } else { + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_unary.clip + op = model_builder.CreateOperation(node, "clip"); + + Operation& clip_op = *op; + AddOperationInput(clip_op, "x", input_name); + + // if min and max were attributes we need to add initializers. otherwise we use the existing inputs + const bool min_max_attribs = node.SinceVersion() < 11; + std::string_view min_name = min_max_attribs ? model_builder.AddScalarConstant(clip_op.type(), "min", min) + : node.InputDefs()[1]->Name(); + + AddOperationInput(clip_op, "alpha", min_name); + + if (has_max) { + std::string_view max_name = min_max_attribs ? model_builder.AddScalarConstant(clip_op.type(), "max", max) + : node.InputDefs()[2]->Name(); + AddOperationInput(clip_op, "beta", max_name); + } + } } - // Handle clipping at min first - if (has_min) { - std::unique_ptr min_layer = model_builder.CreateNNLayer(node, "_Clip_min"); - if (min == 0.0f) { // If min is 0. then this min will be handled by relu - min_layer->mutable_activation()->mutable_relu(); - } else { // otherwise, min will be handled by unary->threshold - min_layer->mutable_unary()->set_alpha(min); - min_layer->mutable_unary()->set_type(COREML_SPEC::UnaryFunctionLayerParams::THRESHOLD); + AddOperationOutput(*op, output); + model_builder.AddOperation(std::move(op)); + } else +#endif // defined(COREML_ENABLE_MLPROGRAM) + { + // TODO: CoreML has a Clip layer for NeuralNetwork. Added in CoreML 4. We could potentially use that if available + // to simplify. + // https://apple.github.io/coremltools/mlmodel/Format/NeuralNetwork.html#cliplayerparams + + if (!has_min && !has_max) { + // Clip without min/max is an identity node + // In CoreML we don't have identity, use ActivationLinear instead + std::unique_ptr layer = model_builder.CreateNNLayer(node); + layer->mutable_activation()->mutable_linear()->set_alpha(1.0f); + *layer->mutable_input()->Add() = input_name; + *layer->mutable_output()->Add() = output_name; + + model_builder.AddLayer(std::move(layer)); + } else { + // The implementation of clip(min, max) is done by + // 1. Clipping at min -> max(input, min) is handled by + // min_output = threshold(input, min) + // 2. 
Clipping at max -> min(min_output, max) is handled by + // output = -1 * (threshold(-min_output, -max)) + + // Now we have at least one or min or max is not default value + // Clipping at max will need take the output of clipping at min, or the node input, if min value is default + // If max value is default, the output of clipping at min will be the output of the node + std::string min_output_name = output_name; + if (has_max) { + min_output_name = has_min + ? model_builder.GetUniqueName(node_name + "min_output") + : input_name; } - *min_layer->mutable_input()->Add() = input_name; - *min_layer->mutable_output()->Add() = min_output_name; - model_builder.AddLayer(std::move(min_layer)); - } - - // Clipping at max is handled by -1 * (threshold (-min_output, -max)) - if (has_max) { - const auto threshold_output_name = model_builder.GetUniqueName(MakeString(node_name, "threshold_output")); - { // Add threshold layer, which is actually max( -1 * min_output, -max) - auto threshold_layer = model_builder.CreateNNLayer(node, "_Clip_max_threshold"); - threshold_layer->mutable_unary()->set_alpha(-max); - threshold_layer->mutable_unary()->set_scale(-1.0f); - threshold_layer->mutable_unary()->set_type(COREML_SPEC::UnaryFunctionLayerParams::THRESHOLD); - *threshold_layer->mutable_input()->Add() = min_output_name; - *threshold_layer->mutable_output()->Add() = threshold_output_name; - model_builder.AddLayer(std::move(threshold_layer)); + // Handle clipping at min first + if (has_min) { + std::unique_ptr min_layer = model_builder.CreateNNLayer(node, "_Clip_min"); + if (min == 0.0f) { // If min is 0. then this min will be handled by relu + min_layer->mutable_activation()->mutable_relu(); + } else { // otherwise, min will be handled by unary->threshold + min_layer->mutable_unary()->set_alpha(min); + min_layer->mutable_unary()->set_type(COREML_SPEC::UnaryFunctionLayerParams::THRESHOLD); + } + + *min_layer->mutable_input()->Add() = input_name; + *min_layer->mutable_output()->Add() = min_output_name; + model_builder.AddLayer(std::move(min_layer)); } - { // Add linear activation layer -1 * threshold_output - auto linear_layer = model_builder.CreateNNLayer(node, "_Clip_max_linear"); - linear_layer->mutable_activation()->mutable_linear()->set_alpha(-1.0f); - *linear_layer->mutable_input()->Add() = threshold_output_name; - *linear_layer->mutable_output()->Add() = output_name; - model_builder.AddLayer(std::move(linear_layer)); + + // Clipping at max is handled by -1 * (threshold (-min_output, -max)) + if (has_max) { + const auto threshold_output_name = model_builder.GetUniqueName(MakeString(node_name, "threshold_output")); + { // Add threshold layer, which is actually max( -1 * min_output, -max) + auto threshold_layer = model_builder.CreateNNLayer(node, "_Clip_max_threshold"); + threshold_layer->mutable_unary()->set_alpha(-max); + threshold_layer->mutable_unary()->set_scale(-1.0f); + threshold_layer->mutable_unary()->set_type(COREML_SPEC::UnaryFunctionLayerParams::THRESHOLD); + *threshold_layer->mutable_input()->Add() = min_output_name; + *threshold_layer->mutable_output()->Add() = threshold_output_name; + model_builder.AddLayer(std::move(threshold_layer)); + } + { // Add linear activation layer -1 * threshold_output + auto linear_layer = model_builder.CreateNNLayer(node, "_Clip_max_linear"); + linear_layer->mutable_activation()->mutable_linear()->set_alpha(-1.0f); + *linear_layer->mutable_input()->Add() = threshold_output_name; + *linear_layer->mutable_output()->Add() = output_name; + 
model_builder.AddLayer(std::move(linear_layer)); + } } } } diff --git a/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc index 05e43dbbd16af..38125957bf481 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc @@ -67,99 +67,25 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N AddOperationInput(*conv_op, "bias", input_defs[2]->Name()); } - // ONNX attributes. Add as inputs if specified/required - auto strides = helper.GetInt64s("strides"); - auto dilations = helper.GetInt64s("dilations"); - auto groups = helper.GetInt64("group"); - // we know this input has a valid shape due to the check in IsOpSupportedImpl. ignore N and C dims. const auto num_spatial_dims = input_defs[1]->Shape()->dim_size() - 2; const auto& op_type = conv_op->type(); - if (strides) { - AddOperationInput(*conv_op, "strides", model_builder.AddConstant(op_type, "strides", *strides)); - } else { - // spec says optional. testing suggests otherwise for at least the iOS15 target (CoreML5) - static const auto default_value = std::vector(num_spatial_dims, 1); - AddOperationInput(*conv_op, "strides", model_builder.AddConstant(op_type, "strides", default_value)); - } + // Spec says strides and dilations are optional, but reality is they're required for at least the iOS15 target + // (CoreML5). + const auto strides = helper.Get("strides", std::vector(num_spatial_dims, 1)); + auto dilations = helper.Get("dilations", std::vector(num_spatial_dims, 1)); + auto groups = helper.GetInt64("group"); - if (dilations) { - AddOperationInput(*conv_op, "dilations", model_builder.AddConstant(op_type, "dilations", *dilations)); - } else { - // spec says optional. testing suggests otherwise for at least the iOS15 target (CoreML5) - static const auto default_value = std::vector(num_spatial_dims, 1); - AddOperationInput(*conv_op, "dilations", model_builder.AddConstant(op_type, "dilations", default_value)); - } + AddOperationInput(*conv_op, "strides", model_builder.AddConstant(op_type, "strides", strides)); + AddOperationInput(*conv_op, "dilations", model_builder.AddConstant(op_type, "dilations", dilations)); if (groups) { AddOperationInput(*conv_op, "groups", model_builder.AddScalarConstant(op_type, "groups", *groups)); } - AutoPadType auto_pad_type = StringToAutoPadType(helper.Get("auto_pad", "NOTSET")); - - // pad type (string) - // valid - no pads (ONNX auto_pad VALID) - // custom - pads input (ONNX NOTSET) - // same - inferred to be `d_out[i] = ceil(d_in[i] / strides[i])` (assuming == ONNX SAME_UPPER) - // same_lower - as per same but any extra rows/cols are added at top/left if padding is odd (ONNX SAME_LOWER) - // - // TODO: See if we want to update HandleAutoPad to support 1D (and 3D) so we can infer if an autopad value - // can be used. TBD if that provides any performance benefit with ML Program though as CoreML could - // potentially do that for us. - switch (auto_pad_type) { - case AutoPadType::NOTSET: { - // use `pads` attribute. - auto onnx_pads = helper.GetInt64s("pads"); // 'pads' must be provided if auto_pad is NOTSET - if (onnx_pads) { - AddOperationInput(*conv_op, "pad_type", - model_builder.AddScalarConstant(op_type, "pad_type", std::string("custom"))); - - // need to re-order from x1_start, x2_start..., x1_end, x2_end... to - // x1_start, x1_end, x2_start, x2_end,... 
- size_t num_pads = onnx_pads->size(); - size_t num_dims = num_pads / 2; - std::vector reordered_pads(num_pads, 0); - for (size_t i = 0; i < num_pads; ++i) { - auto cur_dim = i % num_dims; - if (i < num_dims) { // start values - reordered_pads[cur_dim * 2] = (*onnx_pads)[i]; - } else { // end values - reordered_pads[cur_dim * 2 + 1] = (*onnx_pads)[i]; - } - } - - AddOperationInput(*conv_op, "pad", model_builder.AddConstant(op_type, "pad", reordered_pads)); - - break; - } - - // in theory the pads may not be provided and in that case the default is no padding. - // as that is the same as 'valid', fall through - [[fallthrough]]; - } - case AutoPadType::VALID: - AddOperationInput(*conv_op, "pad_type", - model_builder.AddScalarConstant(op_type, "pad_type", std::string("valid"))); - - break; - case AutoPadType::SAME_UPPER: - case AutoPadType::SAME_LOWER: { - const auto pad_type = (auto_pad_type == AutoPadType::SAME_UPPER ? "same" : "same_lower"); - AddOperationInput(*conv_op, "pad_type", - model_builder.AddScalarConstant(op_type, "pad_type", std::string(pad_type))); - - // despite what the spec says, a 'pad' input seems to be required. - // https://github.com/apple/coremltools/issues/2127 - // provide the default value. passing in an empty vector also works. TBD what's better. - std::vector ignored_pads(num_spatial_dims * 2, 0); - AddOperationInput(*conv_op, "pad", model_builder.AddConstant(op_type, "pad", ignored_pads)); - - break; - } - } + AddPadTypeAndPads(*conv_op, model_builder, op_type, helper, num_spatial_dims); - // set output AddOperationOutput(*conv_op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(conv_op)); @@ -297,7 +223,7 @@ bool ConvOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara const auto& input_defs = node.InputDefs(); const auto& weight_name = input_defs[1]->Name(); - const auto* weight = input_params.graph_viewer.GetConstantInitializer(weight_name, true); + const auto* weight = input_params.graph_viewer.GetConstantInitializer(weight_name); #if defined(COREML_ENABLE_MLPROGRAM) if (input_params.create_mlprogram) { @@ -324,7 +250,7 @@ bool ConvOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara return false; } - if (input_defs.size() > 2 && !input_params.graph_viewer.GetConstantInitializer(input_defs[2]->Name(), true)) { + if (input_defs.size() > 2 && !input_params.graph_viewer.GetConstantInitializer(input_defs[2]->Name())) { LOGS(logger, VERBOSE) << "The bias of Conv [" << name << "] must be a constant initializer"; return false; } diff --git a/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc index 48f77354d7c30..8daf64dc4a457 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc @@ -22,18 +22,51 @@ class GemmOpBuilder : public BaseOpBuilder { Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const override; - bool IsOpSupportedImpl(const Node& /* node */, const OpBuilderInputParams& /* input_params */, - const logging::Logger& /* logger */) const override; + bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } }; void GemmOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const { const auto& op = node.OpType(); 
const auto& input_defs(node.InputDefs()); - // We have already embedded the weights (matrix B and C(if any)) into the coreml layer - // No need to copy them later to reduce memory consumption - model_builder.AddInitializerToSkip(input_defs[1]->Name()); - if (op == "Gemm" && input_defs.size() > 2) { - model_builder.AddInitializerToSkip(input_defs[2]->Name()); + const bool is_gemm = op == "Gemm"; + +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + // we have to transpose the weight input of Gemm if transB is false, and potentially override the bias shape + if (is_gemm) { + NodeAttrHelper helper(node); + const auto transB = helper.Get("transB", 0); + if (transB == 0) { + model_builder.AddInitializerToSkip(input_defs[1]->Name()); + } + + if (input_defs.size() > 2) { + // ONNX spec requires B to be 2D and we required it to be a constant initializer so reading N this way is safe + // B is {K, N] by default. or {N, K} if transB is true + int N_dim = transB ? 0 : 1; + int64_t N = input_defs[1]->Shape()->dim().at(N_dim).dim_value(); + + const auto& bias_name = input_defs[2]->Name(); + const auto& bias = *model_builder.GetConstantInitializer(bias_name); + if (bias.dims_size() != 1 || bias.dims(0) != N) { + // we have to override the shape/duplicate data to convert {}, {1} or {1, N} to 1D {N} + // when adding the Gemm operation so skip adding the original initializer + model_builder.AddInitializerToSkip(bias_name); + } + } + } + } else +#endif // defined(COREML_ENABLE_MLPROGRAM) + { + // We have already embedded the weights (matrix B and C(if any)) into the coreml layer + // No need to copy them later to reduce memory consumption + model_builder.AddInitializerToSkip(input_defs[1]->Name()); + if (is_gemm && input_defs.size() > 2) { + model_builder.AddInitializerToSkip(input_defs[2]->Name()); + } } } @@ -57,54 +90,152 @@ static Status GetTensorFloatDataTransposed(const ONNX_NAMESPACE::TensorProto& te } Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, - const logging::Logger& /* logger */) const { + const logging::Logger& logger) const { std::unique_ptr layer = model_builder.CreateNNLayer(node); const auto& op_type = node.OpType(); const auto& input_defs = node.InputDefs(); - const auto& b_tensor = *model_builder.GetInitializerTensors().at(input_defs[1]->Name()); - const auto& b_shape = b_tensor.dims(); - - auto* coreml_inner_product = layer->mutable_innerproduct(); - - // The coreml innerproduct weight (matrix B) is stored transposed - // - for MatMul and Gemm (transB = 0), the coreml weight is B' - // - for Gemm (transB = 1), the coreml weight is B - if (op_type == "MatMul") { - coreml_inner_product->set_inputchannels(b_shape[0]); - coreml_inner_product->set_outputchannels(b_shape[1]); - // Add weight (b of MatMul) - std::vector b_transposed; - ORT_RETURN_IF_ERROR(GetTensorFloatDataTransposed(b_tensor, b_transposed)); - CreateCoreMLWeight(*coreml_inner_product->mutable_weights(), b_transposed); - } else { // Gemm - NodeAttrHelper helper(node); - const auto transB = helper.Get("transB", 0); - if (transB == 0) { - coreml_inner_product->set_inputchannels(b_shape[0]); - coreml_inner_product->set_outputchannels(b_shape[1]); + const auto& a = *input_defs[0]; + const auto& b = *input_defs[1]; + const auto* b_initializer = model_builder.GetConstantInitializer(b.Name()); // MLProgram MatMul may not be constant + + const bool is_matmul = op_type == "MatMul"; + const bool is_gemm = op_type == "Gemm"; + + NodeAttrHelper helper(node); + const 
auto transB = is_gemm ? helper.Get("transB", 0) : 0; + + std::vector b_shape; + ORT_IGNORE_RETURN_VALUE(GetShape(b, b_shape, logger)); + int64_t b0 = -1, b1 = -1; + + // ML Program MatMul supports N-D input + if (model_builder.CreateMLProgram() && is_matmul) { + if (b_shape.size() == 1) { + // B is treated as {b_shape[0], 1} according to the numpy rules. + b0 = b_shape[0]; + b1 = 1; + } else { + // last 2 dims are used + b0 = b_shape[b_shape.size() - 2]; + b1 = b_shape[b_shape.size() - 1]; + } + } else { + // we only support 2D input + b0 = b_shape[0]; + b1 = b_shape[1]; + } + + // B is {K, N} in ONNX spec by default, or {N, K} in Gemm if transB is true + const auto K = transB ? b1 : b0; + const auto N = transB ? b0 : b1; + +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; + + if (is_gemm) { + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.linear.linear + auto gemm_op = model_builder.CreateOperation(node, "linear"); + AddOperationInput(*gemm_op, "x", a.Name()); + + // CoreML takes weight input as {N, K} which is the reverse of ONNX. + // if transB is true the input weight is {N, K} so can be added directly. + if (transB) { + AddOperationInput(*gemm_op, "weight", b.Name()); + } else { + // transpose from {K, N} to {N, K} + std::vector weight_nk; + std::vector weight_nk_shape = {N, K}; + ORT_RETURN_IF_ERROR(GetTensorFloatDataTransposed(*b_initializer, weight_nk)); + + AddOperationInput(*gemm_op, "weight", + model_builder.AddConstant(gemm_op->type(), b.Name() + "_t", weight_nk, weight_nk_shape)); + } + + if (input_defs.size() == 3) { + const auto& bias_arg = *input_defs[2]; + const auto& bias = *model_builder.GetConstantInitializer(bias_arg.Name()); + + // CoreML linear op requires bias to be 1D tensor of size N + if (bias.dims_size() == 1 && bias.dims().at(0) == N) { + // can use existing initializer + AddOperationInput(*gemm_op, "bias", bias_arg.Name()); + } else { + Initializer unpacked_tensor(bias); + auto bias_data = unpacked_tensor.DataAsSpan(); + std::string_view bias_data_name; + if (bias_data.size() == 1) { + // expand scalar to N + std::vector expanded_bias_data(N, bias_data[0]); + bias_data_name = model_builder.AddConstant(gemm_op->type(), "bias", expanded_bias_data); + } else { + // can use data as-is but need to adjust shape (inferred by AddConstant as {bias_data.size()}) + bias_data_name = model_builder.AddConstant(gemm_op->type(), "bias", bias_data); + } + + AddOperationInput(*gemm_op, "bias", bias_data_name); + } + } + + AddOperationOutput(*gemm_op, *node.OutputDefs()[0]); + model_builder.AddOperation(std::move(gemm_op)); + } else { + // CoreML implementation is the same as ONNX MatMul. + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.linear.matmul + auto matmul_op = model_builder.CreateOperation(node, "matmul"); + AddOperationInput(*matmul_op, "x", a.Name()); + AddOperationInput(*matmul_op, "y", b.Name()); + + // once again the spec lies and says transpose_y and transpose_x are optional... 
+ auto false_value_name = model_builder.AddScalarConstant(matmul_op->type(), "false", false); + AddOperationInput(*matmul_op, "transpose_x", false_value_name); + AddOperationInput(*matmul_op, "transpose_y", false_value_name); + + AddOperationOutput(*matmul_op, *node.OutputDefs()[0]); + model_builder.AddOperation(std::move(matmul_op)); + } + } else +#endif // defined(COREML_ENABLE_MLPROGRAM) + { + auto* coreml_inner_product = layer->mutable_innerproduct(); + + *layer->mutable_input()->Add() = a.Name(); + + coreml_inner_product->set_inputchannels(K); + coreml_inner_product->set_outputchannels(N); + + // CoreML takes weight input as {N, K} which is the reverse of ONNX. + // if Gemm's transB is true the input weight is {N, K} and can be added directly. + if (transB) { + ORT_RETURN_IF_ERROR(CreateCoreMLWeight(*coreml_inner_product->mutable_weights(), *b_initializer)); + } else { std::vector b_transposed; - ORT_RETURN_IF_ERROR(GetTensorFloatDataTransposed(b_tensor, b_transposed)); + ORT_RETURN_IF_ERROR(GetTensorFloatDataTransposed(*b_initializer, b_transposed)); CreateCoreMLWeight(*coreml_inner_product->mutable_weights(), b_transposed); - } else { - coreml_inner_product->set_inputchannels(b_shape[1]); - coreml_inner_product->set_outputchannels(b_shape[0]); - // Add weight (b of MatMul) - ORT_RETURN_IF_ERROR(CreateCoreMLWeight(*coreml_inner_product->mutable_weights(), b_tensor)); } - // Add bias if present - if (input_defs.size() > 2) { + if (is_gemm && input_defs.size() > 2) { + // Add bias coreml_inner_product->set_hasbias(true); - const auto& bias_tensor = *model_builder.GetInitializerTensors().at(input_defs[2]->Name()); - ORT_RETURN_IF_ERROR(CreateCoreMLWeight(*coreml_inner_product->mutable_bias(), bias_tensor)); + const auto& bias_tensor = *model_builder.GetConstantInitializer(input_defs[2]->Name()); + + // if scalar, or single value expand to 1D tensor of size N + // IsOpSupportedImpl enforces it's scalar, {1}, {N}, or {1, N}. 
+ Initializer unpacked_tensor(bias_tensor); + auto bias_data = unpacked_tensor.DataAsSpan(); + if (bias_data.size() == 1 && N > 1) { + std::vector expanded_bias_data(N, bias_data[0]); + CreateCoreMLWeight(*coreml_inner_product->mutable_bias(), expanded_bias_data); + } else { + CreateCoreMLWeight(*coreml_inner_product->mutable_bias(), bias_data); + } } - } - *layer->mutable_input()->Add() = input_defs[0]->Name(); - *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + model_builder.AddLayer(std::move(layer)); + } - model_builder.AddLayer(std::move(layer)); return Status::OK(); } @@ -112,98 +243,105 @@ bool GemmOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara const logging::Logger& logger) const { const auto& op_type = node.OpType(); const auto& input_defs(node.InputDefs()); + const bool is_matmul = op_type == "MatMul"; + const bool is_gemm = op_type == "Gemm"; + size_t a_idx = 0, b_idx = 1, c_idx = 2; // A*B+C - const auto& initializers = input_params.graph_viewer.GetAllInitializedTensors(); - if (!Contains(initializers, input_defs[b_idx]->Name())) { - LOGS(logger, VERBOSE) << "B of Gemm/Matmul must be an initializer tensor"; + std::vector a_shape; + if (!GetShape(*input_defs[a_idx], a_shape, logger)) { return false; } - std::vector a_shape; - { - if (!GetShape(*input_defs[a_idx], a_shape, logger)) - return false; - - if (a_shape.size() != 2) { - LOGS(logger, VERBOSE) << "A must be 2D"; - return false; - } + std::vector b_shape; + if (!GetShape(*input_defs[b_idx], b_shape, logger)) { + return false; + } - // TODO is it ok if the shape is dynamic and empty? - if (Product(a_shape) == 0) { - LOGS(logger, VERBOSE) << "A must be non-empty"; + if (!input_params.graph_viewer.GetConstantInitializer(input_defs[b_idx]->Name())) { + if (input_params.create_mlprogram && is_matmul) { + // ML Program MatMul allows non-constant B input + } else { + LOGS(logger, VERBOSE) << op_type << " B input must be a constant initializer"; return false; } } - std::vector b_shape; - { - if (!GetShape(*input_defs[b_idx], b_shape, logger)) - return false; - - if (b_shape.size() != 2) { - LOGS(logger, VERBOSE) << "B must be 2D"; - return false; - } + if (is_matmul) { + if (input_params.create_mlprogram) { + // ML Program matmul op has numpy semantics the same as the ONNX spec so we can use directly + } else { + // we could potentially support 1D and 3D if required. beyond 3D the dims that merge diverge. + // https://github.com/apple/coremltools/blob/1931758aae383c83daddfc56f11a24a9d2bf4b87/coremltools/converters/onnx/_operators.py#L1607 + // https://github.com/apple/coremltools/blob/1931758aae383c83daddfc56f11a24a9d2bf4b87/coremltools/converters/mil/backend/nn/op_mapping.py#L1374 + // https://apple.github.io/coremltools/mlmodel/Format/NeuralNetwork.html#innerproductlayerparams + if (a_shape.size() != 2 || b_shape.size() != 2) { + LOGS(logger, VERBOSE) << "a and b inputs must be 2D. 
"; + return false; + } - if (Product(b_shape) == 0) { - LOGS(logger, VERBOSE) << "B must be non-empty"; - return false; + if (input_defs.size() > 2) { + LOGS(logger, VERBOSE) << "MatMul with C input is not supported"; + return false; + } } } - if (op_type == "Gemm") { + if (is_gemm) { + // A and B are 2D due to the ONNX spec NodeAttrHelper helper(node); const auto transA = helper.Get("transA", 0); const auto transB = helper.Get("transB", 0); const auto alpha = helper.Get("alpha", 1.0f); const auto beta = helper.Get("beta", 1.0f); + + // TODO: We can support transA, alpha and beta by using multiple layers/operations if needed. if (!(transA == 0 && alpha == 1.f && beta == 1.f)) { - LOGS(logger, VERBOSE) << "Only transA == 0, alpha == 1.0 " - << "and beta == 1.0 is supported." + LOGS(logger, VERBOSE) << "Only support for transA == 0, alpha == 1.0 " + << "and beta == 1.0 is currently implemented." << " transA " << transA << " alpha " << alpha << " beta " << beta; return false; } - // C of Gemm - // For now we only support {n} or {1,n} tensor if (input_defs.size() == 3) { - if (!Contains(initializers, input_defs[c_idx]->Name())) { - LOGS(logger, VERBOSE) << "C of Gemm must be an initializer tensor"; + if (!input_params.graph_viewer.GetConstantInitializer(input_defs[c_idx]->Name())) { + LOGS(logger, VERBOSE) << "C of Gemm must be a constant initializer"; return false; } std::vector c_shape; - if (!GetShape(*input_defs[c_idx], c_shape, logger)) + if (!GetShape(*input_defs[c_idx], c_shape, logger)) { return false; + } - size_t c_dim = c_shape.size(); + // B is {K, N} in ONNX spec by default, or {N, K} in Gemm if transB is true + const auto N = transB ? b_shape[0] : b_shape[1]; - if (c_dim == 0) { - LOGS(logger, VERBOSE) << "C of Gemm cannot be a scalar"; - return false; - } + size_t c_rank = c_shape.size(); - if (c_dim != 1) { - // If C is a (2+)d tensor, it must have the format {1, 1, ..., 1, n} - // where every except the last dimension should be 1 - for (size_t i = 0; i < c_dim - 1; ++i) { - if (c_shape[i] != 1) { - LOGS(logger, VERBOSE) << "C of Gemm must be a vector or a tensor with only last dimension != 1"; - return false; + // allowed: scalar, or 1D where the value is 1 or N, 2D with shape {1, N} + bool c_valid = false; + switch (c_rank) { + case 0: + c_valid = true; + break; + case 1: + if (c_shape[0] == 1 || c_shape[0] == N) { + c_valid = true; } - } + break; + case 2: + if (c_shape[0] == 1 && c_shape[1] == N) { + c_valid = true; + } + break; } - auto c_size = c_shape[c_dim - 1]; - if (c_size != (transB == 0 ? b_shape[1] : b_shape[0])) { - LOGS(logger, VERBOSE) << "C of Gemm must be a vector of b_shape[" - << (transB == 0 ? "1" : "0") << "]" - << " b_shape: [" << b_shape[0] << ", " << b_shape[1] << "]" - << " c_size: " << c_size; + if (!c_valid) { + LOGS(logger, VERBOSE) << "Shape of C Gemm input must be {}, {1}, {N}, or {1, N}. 
N:" << N << " C shape:" + << Shape2String(c_shape); return false; } diff --git a/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc index 01aced739b36d..17910ba6fd486 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc @@ -19,104 +19,176 @@ class PoolOpBuilder : public BaseOpBuilder { bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } }; Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { - std::unique_ptr layer = model_builder.CreateNNLayer(node); - - auto* coreml_pool = layer->mutable_pooling(); const auto& op_type = node.OpType(); const auto& input_defs = node.InputDefs(); - bool is_global_pooling = false; - if (op_type == "GlobalAveragePool") { - is_global_pooling = true; - coreml_pool->set_type(COREML_SPEC::PoolingLayerParams_PoolingType_AVERAGE); - } else if (op_type == "GlobalMaxPool") { - is_global_pooling = true; - coreml_pool->set_type(COREML_SPEC::PoolingLayerParams_PoolingType_MAX); - } else if (op_type == "AveragePool") { - coreml_pool->set_type(COREML_SPEC::PoolingLayerParams_PoolingType_AVERAGE); - } else if (op_type == "MaxPool") { - coreml_pool->set_type(COREML_SPEC::PoolingLayerParams_PoolingType_MAX); - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "PoolOpBuilder, unknown op: ", op_type); - } +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; + + std::string_view coreml_op_type; + bool is_global = false; + bool is_avg_pool = false; + if (op_type == "GlobalAveragePool") { + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.reduction.reduce_mean + coreml_op_type = "reduce_mean"; + is_global = true; + } else if (op_type == "GlobalMaxPool") { + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.reduction.reduce_max + coreml_op_type = "reduce_max"; + is_global = true; + } else if (op_type == "AveragePool") { + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.pool.avg_pool + coreml_op_type = "avg_pool"; + is_avg_pool = true; + } else if (op_type == "MaxPool") { + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.pool.max_pool + coreml_op_type = "max_pool"; + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "PoolOpBuilder, unexpected op: ", op_type); + } - if (is_global_pooling) { - coreml_pool->set_globalpooling(true); - coreml_pool->mutable_valid(); - } else { // AveragePool or MaxPool - NodeAttrHelper helper(node); - const auto kernel_shape = helper.Get("kernel_shape", std::vector{0, 0}); - const auto strides = helper.Get("strides", std::vector{1, 1}); - const auto onnx_pads = helper.Get("pads", std::vector{0, 0, 0, 0}); - - coreml_pool->add_kernelsize(kernel_shape[0]); - coreml_pool->add_kernelsize(kernel_shape[1]); - coreml_pool->add_stride(strides[0]); - coreml_pool->add_stride(strides[1]); - 
coreml_pool->set_avgpoolexcludepadding(helper.Get("count_include_pad", 0) == 0); - coreml_pool->set_globalpooling(false); - - // Add Padding - // Usually using autopadding is more efficient than using explicit padding - // Try to see if we can map explicit padding to auto padding - std::vector input_shape; - ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Cannot get shape"); - AutoPadType auto_pad_type; - ORT_RETURN_IF_ERROR(HandleAutoPad(input_shape, kernel_shape[0], kernel_shape[1], - onnx_pads, strides, {1, 1} /* dilations */, - StringToAutoPadType(helper.Get("auto_pad", "NOTSET")), - auto_pad_type)); - - if (AutoPadType::SAME_UPPER == auto_pad_type || AutoPadType::SAME_LOWER == auto_pad_type) { - auto* padding_type = coreml_pool->mutable_same(); - if (AutoPadType::SAME_LOWER == auto_pad_type) { // default is SAME_UPPER - padding_type->set_asymmetrymode(COREML_SPEC::SamePadding_SamePaddingMode_TOP_LEFT_HEAVY); + std::unique_ptr op = model_builder.CreateOperation(node, coreml_op_type); + + AddOperationInput(*op, "x", input_defs[0]->Name()); + + if (is_global) { + // keep N and C dims, reduce the rest with keepdims=True. equivalent to the ONNX Global*Pool ops. + std::vector axes{2, 3}; // we only support 4D input currently. + AddOperationInput(*op, "axes", model_builder.AddConstant(op->type(), "axes", axes)); + AddOperationInput(*op, "keep_dims", model_builder.AddScalarConstant(op->type(), "keep_dims", true)); + } else { + NodeAttrHelper helper(node); + constexpr int num_spatial_dims = 2; // we only support 4D. -2 for N and C dims. + + AddPadTypeAndPads(*op, model_builder, op->type(), helper, num_spatial_dims); + + const auto kernel_shape = helper.GetInt64s("kernel_shape"); // required + AddOperationInput(*op, "kernel_sizes", model_builder.AddConstant(op->type(), "kernel_sizes", *kernel_shape)); + + // in theory all these values are optional according to the CoreML spec but simpler to just provide default + // values as the actual model compilation tends to require them. 
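For reference, the `ceil_mode` value converted just below controls whether a trailing partial pooling window contributes an extra output element. A minimal sketch of the ONNX output-size rule it affects, assuming dilation of 1; the helper name below is illustrative only and not an onnxruntime API:

```
#include <cstdint>
#include <cstdio>

// Output size of one spatial dim for ONNX pooling, dilation == 1.
//   floor mode: out = floor((in + pad_begin + pad_end - kernel) / stride) + 1
//   ceil  mode: out = ceil ((in + pad_begin + pad_end - kernel) / stride) + 1
int64_t PoolOutputDim(int64_t in, int64_t kernel, int64_t stride,
                      int64_t pad_begin, int64_t pad_end, bool ceil_mode) {
  const int64_t span = in + pad_begin + pad_end - kernel;  // non-negative for valid configs
  int64_t out = span / stride + 1;                         // integer division acts as floor here
  if (ceil_mode && span % stride != 0) {
    ++out;  // keep the trailing partial window
  }
  return out;
}

int main() {
  // 6-wide input, kernel 3, stride 2, no padding: floor mode -> 2, ceil mode -> 3
  std::printf("floor=%lld ceil=%lld\n",
              static_cast<long long>(PoolOutputDim(6, 3, 2, 0, 0, false)),
              static_cast<long long>(PoolOutputDim(6, 3, 2, 0, 0, true)));
  return 0;
}
```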
+ const auto strides = helper.Get("strides", std::vector(num_spatial_dims, 1)); + const bool ceil_mode = helper.Get("ceil_mode", int64_t(0)); // convert int64_t to bool + + AddOperationInput(*op, "strides", model_builder.AddConstant(op->type(), "strides", strides)); + AddOperationInput(*op, "ceil_mode", model_builder.AddScalarConstant(op->type(), "ceil_mode", ceil_mode)); + + if (is_avg_pool) { + const bool count_exclude_pad = helper.Get("count_include_pad", int64_t(0)) == 0; + AddOperationInput(*op, "exclude_padding_from_average", + model_builder.AddScalarConstant(op->type(), "count_exclude_pad", count_exclude_pad)); } + } + + AddOperationOutput(*op, *node.OutputDefs()[0]); + model_builder.AddOperation(std::move(op)); + + } else +#endif // defined(COREML_ENABLE_MLPROGRAM) + { + std::unique_ptr layer = model_builder.CreateNNLayer(node); + + auto* coreml_pool = layer->mutable_pooling(); + + bool is_global_pooling = false; + if (op_type == "GlobalAveragePool") { + is_global_pooling = true; + coreml_pool->set_type(COREML_SPEC::PoolingLayerParams_PoolingType_AVERAGE); + } else if (op_type == "GlobalMaxPool") { + is_global_pooling = true; + coreml_pool->set_type(COREML_SPEC::PoolingLayerParams_PoolingType_MAX); + } else if (op_type == "AveragePool") { + coreml_pool->set_type(COREML_SPEC::PoolingLayerParams_PoolingType_AVERAGE); + } else if (op_type == "MaxPool") { + coreml_pool->set_type(COREML_SPEC::PoolingLayerParams_PoolingType_MAX); } else { - auto* padding_type = coreml_pool->mutable_valid(); - if (AutoPadType::NOTSET == auto_pad_type && onnx_pads != std::vector{0, 0, 0, 0}) { - // NOTSET is adding the explicit padding to the ValidPadding.paddingAmounts - auto* height_border = padding_type->mutable_paddingamounts()->add_borderamounts(); - height_border->set_startedgesize(onnx_pads[0]); - height_border->set_endedgesize(onnx_pads[2]); - auto* width_border = padding_type->mutable_paddingamounts()->add_borderamounts(); - width_border->set_startedgesize(onnx_pads[1]); - width_border->set_endedgesize(onnx_pads[3]); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "PoolOpBuilder, unexpected op: ", op_type); + } + + if (is_global_pooling) { + coreml_pool->set_globalpooling(true); + coreml_pool->mutable_valid(); + } else { // AveragePool or MaxPool + NodeAttrHelper helper(node); + const auto kernel_shape = helper.Get("kernel_shape", std::vector{0, 0}); + const auto strides = helper.Get("strides", std::vector{1, 1}); + const auto onnx_pads = helper.Get("pads", std::vector{0, 0, 0, 0}); + + coreml_pool->add_kernelsize(kernel_shape[0]); + coreml_pool->add_kernelsize(kernel_shape[1]); + coreml_pool->add_stride(strides[0]); + coreml_pool->add_stride(strides[1]); + coreml_pool->set_avgpoolexcludepadding(helper.Get("count_include_pad", 0) == 0); + coreml_pool->set_globalpooling(false); + + // Add Padding + // Usually using autopadding is more efficient than using explicit padding + // Try to see if we can map explicit padding to auto padding + std::vector input_shape; + ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Cannot get shape"); + AutoPadType auto_pad_type; + ORT_RETURN_IF_ERROR(HandleAutoPad(input_shape, kernel_shape[0], kernel_shape[1], + onnx_pads, strides, {1, 1} /* dilations */, + StringToAutoPadType(helper.Get("auto_pad", "NOTSET")), + auto_pad_type)); + + if (AutoPadType::SAME_UPPER == auto_pad_type || AutoPadType::SAME_LOWER == auto_pad_type) { + auto* padding_type = coreml_pool->mutable_same(); + if (AutoPadType::SAME_LOWER == auto_pad_type) { // default is 
SAME_UPPER + padding_type->set_asymmetrymode(COREML_SPEC::SamePadding_SamePaddingMode_TOP_LEFT_HEAVY); + } + } else { + auto* padding_type = coreml_pool->mutable_valid(); + if (AutoPadType::NOTSET == auto_pad_type && onnx_pads != std::vector{0, 0, 0, 0}) { + // NOTSET is adding the explicit padding to the ValidPadding.paddingAmounts + auto* height_border = padding_type->mutable_paddingamounts()->add_borderamounts(); + height_border->set_startedgesize(onnx_pads[0]); + height_border->set_endedgesize(onnx_pads[2]); + auto* width_border = padding_type->mutable_paddingamounts()->add_borderamounts(); + width_border->set_startedgesize(onnx_pads[1]); + width_border->set_endedgesize(onnx_pads[3]); + } } } - } - *layer->mutable_input()->Add() = input_defs[0]->Name(); - *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + *layer->mutable_input()->Add() = input_defs[0]->Name(); + *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + + model_builder.AddLayer(std::move(layer)); + } - model_builder.AddLayer(std::move(layer)); return Status::OK(); } -bool PoolOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& /* input_params */, +bool PoolOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const { const auto& op_type = node.OpType(); const auto& input_defs = node.InputDefs(); std::vector input_shape; - if (!GetShape(*input_defs[0], input_shape, logger)) + if (!GetShape(*input_defs[0], input_shape, logger)) { return false; + } + // TODO: ML Program supports 3D and 5D. Add if we have a use case for that. const auto input_size = input_shape.size(); if (input_size != 4) { - LOGS(logger, VERBOSE) - << op_type << " only supports rank-4 tensor, input [" - << input_defs[0]->Name() << "] has actual dim count " << input_size; + LOGS(logger, VERBOSE) << op_type << " only supports rank-4 tensor, input [" + << input_defs[0]->Name() << "] has actual dim count " << input_size; return false; } if (op_type == "AveragePool" || op_type == "MaxPool") { NodeAttrHelper helper(node); + const auto storage_order = helper.Get("storage_order", 0); if (storage_order == 1) { LOGS(logger, VERBOSE) << "storage_order == 1 is not supported"; @@ -128,12 +200,14 @@ bool PoolOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara return false; } - // TODO, add support of the ceil_mode by adjusting the padding - // See https://stackoverflow.com/questions/59906456/in-pytorchs-maxpool2d-is-padding-added-depending-on-ceil-mode - // and https://github.com/apple/coremltools/blob/1931758aae383c83daddfc56f11a24a9d2bf4b87/coremltools/converters/mil/frontend/torch/ops.py#L621-L644 - if (helper.Get("ceil_mode", 0) == 1) { - LOGS(logger, VERBOSE) << "ceil_mode == 1 is not supported for pooling"; - return false; + if (!input_params.create_mlprogram) { + // TODO, add support of the ceil_mode by adjusting the padding + // See https://stackoverflow.com/questions/59906456/in-pytorchs-maxpool2d-is-padding-added-depending-on-ceil-mode + // and https://github.com/apple/coremltools/blob/1931758aae383c83daddfc56f11a24a9d2bf4b87/coremltools/converters/mil/frontend/torch/ops.py#L621-L644 + if (helper.Get("ceil_mode", 0) == 1) { + LOGS(logger, VERBOSE) << "ceil_mode == 1 is not supported for pooling"; + return false; + } } if (helper.Get("dilations", std::vector{1, 1}) != diff --git a/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc index 
7ae1746be3122..27d24d9c21893 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc @@ -1,11 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/framework/tensorprotoutils.h" #include "core/optimizer/initializer.h" -#include "core/providers/common.h" #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" #include "core/providers/coreml/builders/model_builder.h" #include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/coreml/shape_utils.h" @@ -26,34 +25,56 @@ class ReshapeOpBuilder : public BaseOpBuilder { // Reshape opset 4- uses attributes for new shape which we do not support for now int GetMinSupportedOpSet(const Node& /* node */) const override { return 5; } + + bool SupportsMLProgram() const override { return true; } }; void ReshapeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const { + // Skip the second input which is the new shape as we always have to create a new version as the CoreML rules + // are different from ONNX. model_builder.AddInitializerToSkip(node.InputDefs()[1]->Name()); } Status ReshapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { - std::unique_ptr layer = model_builder.CreateNNLayer(node); - const auto& input_defs = node.InputDefs(); - const auto& initializers(model_builder.GetInitializerTensors()); - const auto& target_shape_tensor = *initializers.at(input_defs[1]->Name()); - const int64_t* raw_target_shape = target_shape_tensor.int64_data().empty() - ? 
reinterpret_cast(target_shape_tensor.raw_data().data()) - : target_shape_tensor.int64_data().data(); - - const auto size = target_shape_tensor.dims()[0]; - TensorShapeVector target_shape{raw_target_shape, raw_target_shape + size}; std::vector input_shape; - ORT_RETURN_IF_NOT(GetStaticShape(*input_defs[0], input_shape, logger), "Cannot get shape"); - ReshapeHelper helper(TensorShape(input_shape), target_shape); - *layer->mutable_reshapestatic()->mutable_targetshape() = {target_shape.cbegin(), target_shape.cend()}; - *layer->mutable_input()->Add() = input_defs[0]->Name(); - *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + ORT_RETURN_IF_NOT(GetStaticShape(*input_defs[0], input_shape, logger), "Cannot get shape of data"); + + const auto& data_name = input_defs[0]->Name(); + const auto& new_shape_name = input_defs[1]->Name(); + Initializer unpacked_tensor(*model_builder.GetConstantInitializer(new_shape_name)); + TensorShapeVector new_shape = ToShapeVector(unpacked_tensor.DataAsSpan()); + + // ReshapeHelper applies the ONNX rules to create the concrete output shape + ReshapeHelper helper(TensorShape(input_shape), new_shape); + +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; - model_builder.AddLayer(std::move(layer)); + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation.reshape + std::unique_ptr reshape_op = model_builder.CreateOperation(node, "reshape"); + + AddOperationInput(*reshape_op, "x", data_name); + AddOperationInput(*reshape_op, "shape", + model_builder.AddConstant(reshape_op->type(), "shape", ToConstSpan(new_shape))); + + AddOperationOutput(*reshape_op, *node.OutputDefs()[0]); + + model_builder.AddOperation(std::move(reshape_op)); + } else +#endif // defined(COREML_ENABLE_MLPROGRAM) + { + std::unique_ptr layer = model_builder.CreateNNLayer(node); + + *layer->mutable_reshapestatic()->mutable_targetshape() = {new_shape.cbegin(), new_shape.cend()}; + *layer->mutable_input()->Add() = data_name; + *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + + model_builder.AddLayer(std::move(layer)); + } return Status::OK(); } @@ -61,14 +82,15 @@ bool ReshapeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputP const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); const auto& new_shape_name = input_defs[1]->Name(); - const auto& initializers = input_params.graph_viewer.GetAllInitializedTensors(); - if (!Contains(initializers, new_shape_name)) { + const auto* new_shape_tensor = input_params.graph_viewer.GetConstantInitializer(new_shape_name); + if (!new_shape_tensor) { + // ONNX has different rules around how -1 and 0 values are used/combined, and + // we can't check if those can be translated to CoreML if the shape is unknown. 
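For context, ReshapeHelper (used in AddToModelBuilderImpl above) resolves the ONNX-specific 0 and -1 values into a concrete shape before it is handed to CoreML. A minimal sketch of those rules for the default allowzero == 0 case; the function below is illustrative and not the actual ReshapeHelper:

```
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Resolve an ONNX Reshape target shape against a static input shape (allowzero == 0):
//   0  -> copy the corresponding input dimension
//   -1 -> inferred so the total element count is preserved (at most one -1)
std::vector<int64_t> ResolveReshape(const std::vector<int64_t>& input_shape,
                                    std::vector<int64_t> target) {
  int64_t input_size = 1;
  for (int64_t d : input_shape) input_size *= d;

  int64_t known = 1;
  int64_t minus_one_idx = -1;
  for (std::size_t i = 0; i < target.size(); ++i) {
    if (target[i] == 0) target[i] = input_shape[i];  // copy dim (allowzero == 0 semantics)
    if (target[i] == -1) {
      minus_one_idx = static_cast<int64_t>(i);
      continue;
    }
    known *= target[i];
  }

  if (minus_one_idx >= 0) target[minus_one_idx] = input_size / known;
  return target;
}

int main() {
  // {2, 3, 4} reshaped with {0, -1} resolves to the concrete shape {2, 12}
  auto out = ResolveReshape({2, 3, 4}, {0, -1});
  assert(out == (std::vector<int64_t>{2, 12}));
  return 0;
}
```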
LOGS(logger, VERBOSE) << "New shape of reshape must be a constant initializer"; return false; } - const auto& new_shape_tensor = *initializers.at(new_shape_name); - Initializer unpacked_tensor(new_shape_tensor); + Initializer unpacked_tensor(*new_shape_tensor); auto new_shape = unpacked_tensor.DataAsSpan(); if (new_shape.empty()) { LOGS(logger, VERBOSE) << "New shape of reshape cannot be empty"; @@ -84,7 +106,7 @@ bool ReshapeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputP return false; } - // CoreML reshape doesn't support new shape with more than 5 dimensions + // CoreML reshape doesn't support new shape with more than 5 dimensions. if (new_shape.size() > 5) { LOGS(logger, VERBOSE) << "Reshape does not support new shape with rank greater than 5. Input shape: " << Shape2String(input_shape) << ", new shape: " << Shape2String(new_shape); @@ -93,7 +115,7 @@ bool ReshapeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputP // CoreML reshape does not support 0 as dimension NodeAttrHelper helper(node); - const bool allow_zero = helper.Get("allowzero ", 0) == 1; + const bool allow_zero = helper.Get("allowzero", 0) == 1; if (allow_zero) { if (std::find(new_shape.begin(), new_shape.end(), int64_t{0}) != new_shape.end()) { LOGS(logger, VERBOSE) << "Reshape does not support new shape with 0 as dimension when allowzero is enabled. " diff --git a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc index 35dcde41a6bcf..6c2fcc2ace856 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc @@ -98,7 +98,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const auto& input_defs = node.InputDefs(); const auto& initializers(model_builder.GetInitializerTensors()); - if (input_defs.size() == 3) { // use scales + if (input_defs.size() >= 3 && input_defs[2]->Exists()) { // use scales std::vector scales; ORT_RETURN_IF_NOT(GetResizeScales(initializers, node, scales, logger), "Error getting resize scales"); coreml_upsample->add_scalingfactor(static_cast(scales[2])); @@ -182,20 +182,24 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPa return false; } + bool using_scales = input_defs.size() >= 3 && input_defs[2]->Exists(); // scales - if (input_defs.size() == 3 && !Contains(initializers, input_defs[2]->Name())) { - LOGS(logger, VERBOSE) << "Input scales of Resize must be known"; + if (using_scales && !input_params.graph_viewer.GetConstantInitializer(input_defs[2]->Name())) { + LOGS(logger, VERBOSE) << "scales input of Resize must be a constant initializer"; return false; } // sizes - if (input_defs.size() > 3 && !Contains(initializers, input_defs[3]->Name())) { - LOGS(logger, VERBOSE) << "Input sizes of Resize must be known"; + if (!using_scales && + (input_defs.size() < 4 || + !input_defs[3]->Exists() || + !input_params.graph_viewer.GetConstantInitializer(input_defs[3]->Name()))) { + LOGS(logger, VERBOSE) << "sizes input of Resize must be a constant initializer"; return false; } // We want to check if the scales or sizes are not trying to resize on N/C channels here - if (input_defs.size() == 3) { // we are using scales + if (using_scales) { std::vector scales; if (!GetResizeScales(initializers, node, scales, logger)) return false; diff --git a/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc 
b/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc index b716af738e1b1..39bfbfe5bba1f 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc @@ -54,7 +54,7 @@ Status PrepareSliceComputeMetadataFromConstantInitializers(const Node& slice_nod return Status::OK(); } - const auto* tensor_proto = graph_viewer.GetConstantInitializer(input_defs[input_idx]->Name(), true); + const auto* tensor_proto = graph_viewer.GetConstantInitializer(input_defs[input_idx]->Name()); ORT_RETURN_IF_NOT(tensor_proto, "Failed to get constant initializer."); Initializer unpacked_tensor(*tensor_proto, graph_viewer.ModelPath()); const auto data_type = unpacked_tensor.data_type(); diff --git a/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc index 266396a0fe90e..d6584124c6aba 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc @@ -52,7 +52,7 @@ Status SoftmaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, target_shape.push_back(size_to_dimension); target_shape.push_back(size_from_dimension); - const auto reshape1_output_name = model_builder.GetUniqueName(MakeString(node.Name(), "reshape1_output")); + const auto reshape1_output_name = model_builder.GetUniqueName(node, "reshape1_output"); { // Add reshape layer auto reshape_layer = model_builder.CreateNNLayer(node, "_Softmax_reshape1"); *reshape_layer->mutable_reshapestatic()->mutable_targetshape() = {target_shape.cbegin(), target_shape.cend()}; @@ -60,7 +60,7 @@ Status SoftmaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, *reshape_layer->mutable_output()->Add() = reshape1_output_name; model_builder.AddLayer(std::move(reshape_layer)); } - const auto softmax_output_name = model_builder.GetUniqueName(MakeString(node.Name(), "softmax_output")); + const auto softmax_output_name = model_builder.GetUniqueName(node, "softmax_output"); { auto* coreml_softmaxnd = layer->mutable_softmaxnd(); coreml_softmaxnd->set_axis(-1); diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.cc b/onnxruntime/core/providers/coreml/builders/model_builder.cc index daab36f7b933d..eb4723a3b9746 100644 --- a/onnxruntime/core/providers/coreml/builders/model_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/model_builder.cc @@ -144,14 +144,18 @@ void CopyOnnxTensorToCoreMLTensor(const ONNX_NAMESPACE::TensorProto& tensor_prot break; } case ONNX_NAMESPACE::TensorProto_DataType_INT64: { - // from: int64_data/raw, to: longints - if (has_raw_data) { - CopyRawDataToRepeatedField(tensor_proto, *tensor_value.mutable_longints()->mutable_values()); - - } else { - tensor_value.mutable_longints()->mutable_values()->CopyFrom(tensor_proto.int64_data()); - } - break; + // enable when this is proven to not be the case + ORT_THROW( + "INT64 is unexpected as CoreML uses 32-bit int for indices. 
" + "Most likely an initializer that should have been skipped was not."); + //// from: int64_data/raw, to: longints + // if (has_raw_data) { + // CopyRawDataToRepeatedField(tensor_proto, *tensor_value.mutable_longints()->mutable_values()); + + //} else { + // tensor_value.mutable_longints()->mutable_values()->CopyFrom(tensor_proto.int64_data()); + //} + // break; } case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: { // from: int32_data/raw, to: bytes @@ -186,18 +190,22 @@ void CopyOnnxTensorToCoreMLTensor(const ONNX_NAMESPACE::TensorProto& tensor_prot break; } case ONNX_NAMESPACE::TensorProto_DataType_UINT64: { - // from: uint64_data/raw, to: longints - if (has_raw_data) { - CopyRawDataToRepeatedField(tensor_proto, *tensor_value.mutable_longints()->mutable_values()); - } else { - // TODO: Is this safe? Need to check the CopyFrom implementation. As it's a straight copy of bytes this - // hopefully can do it as one block instead of iterating and potentially doing a static_cast of each - // individual value. - tensor_value.mutable_longints()->mutable_values()->CopyFrom( - reinterpret_cast&>(tensor_proto.uint64_data())); - } - - break; + // enable when this is proven to not be the case + ORT_THROW( + "UINT64 is unexpected as CoreML uses 32-bit int for indices. " + "Most likely an initializer that should have been skipped was not."); + //// from: uint64_data/raw, to: longints + // if (has_raw_data) { + // CopyRawDataToRepeatedField(tensor_proto, *tensor_value.mutable_longints()->mutable_values()); + // } else { + // // TODO: Is this safe? Need to check the CopyFrom implementation. As it's a straight copy of bytes this + // // hopefully can do it as one block instead of iterating and potentially doing a static_cast of each + // // individual value. + // tensor_value.mutable_longints()->mutable_values()->CopyFrom( + // reinterpret_cast&>(tensor_proto.uint64_data())); + // } + + // break; } case ONNX_NAMESPACE::TensorProto_DataType_BOOL: { // from: int32_data/raw, to: bools @@ -392,23 +400,28 @@ std::string GetModelOutputPath(bool create_ml_program) { } // namespace ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const logging::Logger& logger, - int32_t coreml_version, uint32_t coreml_flags) + int32_t coreml_version, uint32_t coreml_flags, + std::vector&& onnx_input_names, + std::vector&& onnx_output_names) : graph_viewer_(graph_viewer), logger_(logger), coreml_version_(coreml_version), coreml_flags_(coreml_flags), create_ml_program_((coreml_flags_ & COREML_FLAG_CREATE_MLPROGRAM) != 0), model_output_path_(GetModelOutputPath(create_ml_program_)), + onnx_input_names_(std::move(onnx_input_names)), + onnx_output_names_(std::move(onnx_output_names)), coreml_model_(std::make_unique()) { if (create_ml_program_) { #if defined(COREML_ENABLE_MLPROGRAM) coreml_model_->set_specificationversion(CoreMLSpecVersion()); MILSpec::Program& mlprogram = *coreml_model_->mutable_mlprogram(); - MILSpec::Function& main = (*mlprogram.mutable_functions())["main"]; + mlprogram.set_version(1); + mlprogram_main_fn_ = &(*mlprogram.mutable_functions())["main"]; const std::string coreml_opset = "CoreML" + std::to_string(CoreMLVersion()); - *main.mutable_opset() = coreml_opset; - mlprogram_main_ = &(*main.mutable_block_specializations())[coreml_opset]; + *mlprogram_main_fn_->mutable_opset() = coreml_opset; + mlprogram_main_block_ = &(*mlprogram_main_fn_->mutable_block_specializations())[coreml_opset]; // create the ModelPackage. this creates the output directory. 
mlpackage_ = std::make_unique(model_output_path_, /* create */ true); @@ -426,6 +439,8 @@ ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const logging::Logge weights_file_writer_ = std::make_unique(weights_info->path() + "/weight.bin"); #else // should never happen due to handling in coreml_execution_provider.cc + // throw here so all other code in this class can assume create_ml_program_ is only ever true in a build + // where ML Program support is enabled. ORT_THROW("ML Program is not enabled in this build"); #endif } else { @@ -435,6 +450,28 @@ ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const logging::Logge neural_network->set_arrayinputshapemapping( CoreML::Specification::NeuralNetworkMultiArrayShapeMapping::EXACT_ARRAY_MAPPING); } + + // populate names. + const auto& initializers = graph_viewer_.GetAllInitializedTensors(); + const auto& inputs = graph_viewer_.GetInputs(); + // rough guess to try and avoid reallocs. most nodes produce one output but some have more so allow for that. + // also need to convert attributes to constants so allow for that + unique_names_.reserve(initializers.size() + inputs.size() + size_t(graph_viewer_.NumberOfNodes() * 1.5)); + for (const auto& pair : initializers) { + unique_names_.insert(pair.first); + } + + for (const auto* input : inputs) { + unique_names_.insert(input->Name()); + } + + for (const auto& node : graph_viewer_.Nodes()) { + for (const auto& def : node.OutputDefs()) { + if (def->Exists()) { + unique_names_.insert(def->Name()); + } + } + } } ModelBuilder::~ModelBuilder() = default; @@ -455,11 +492,94 @@ void ModelBuilder::AddLayer(std::unique_ptr layer) { neural_network->mutable_layers()->AddAllocated(layer.release()); } -#if defined(COREML_ENABLE_MLPROGRAM) - /* * ML Program related helpers */ +#if defined(COREML_ENABLE_MLPROGRAM) +const std::string& ModelBuilder::GetSafeName(const std::string& name) { + // Check the name is valid according to the MILSpec rules + // `Identifiers, generally used for names and keys, must match the regular expression [A-Za-z\_][A-Za-z0-9\_@]*.` + // + // There is a secondary list of reserved words that the coremltools python uses, but it's not clear if those are + // required here, or if we will ever hit a model that uses one of them. Due to that, skip checking them for now as + // it adds cost and code complexity + // https://github.com/apple/coremltools/blob/8b37641f243b1a3e81452feea311c6e30dcc9287/coremltools/converters/mil/mil/passes/defs/preprocess.py#L151C1-L175C10 + // static InlinedHashSet reserved_names = + // {"any", "bool", "program", "func", "tensor", "list", "dict", "tuple", "true", "false", + // "string", "bf16", "fp16", "fp32", "fp64", "int8", "int16", "int32", "int64", + // "uint8", "uint16", "uint32", "uint64"}; + + // handle empty name. shouldn't happen but code below assumes name is not empty + if (name.empty()) { + return name; + } + + // We don't need '@' or '\' even though they're allowed. Optimize for a good name that does not need to be changed. + + // has been sanitized and changed already + const auto entry = values_to_rename_.find(name); + if (entry != values_to_rename_.end()) { + return entry->second; + } + + // Replace anything but a good char with '_'. 
If first char is 0-9 we prefix with '_'; + bool changed = false; + std::string result = name; + + if (std::isdigit(result[0])) { + changed = true; + result = '_' + name; + } + + for (char& c : result) { + if (!std::isalnum(c) && c != '_') { + changed = true; + c = '_'; + } + } + + if (!changed) { + return name; // return original as the return value is a reference that must remain valid + } + + return (values_to_rename_[name] = GetUniqueName(result)); +} + +void ModelBuilder::SanitizeNames() { + // ML Model level inputs/outputs + auto* desc = coreml_model_->mutable_description(); + for (auto& input : *desc->mutable_input()) { + input.set_name(GetSafeName(input.name())); + } + + for (auto& output : *desc->mutable_output()) { + output.set_name(GetSafeName(output.name())); + } + + // main function inputs/outputs. + for (auto& input : *mlprogram_main_fn_->mutable_inputs()) { + input.set_name(GetSafeName(input.name())); + } + + // outputs from block with operations for current coreml version + for (auto& output : *mlprogram_main_block_->mutable_outputs()) { + output = GetSafeName(output); + } + + // iterate operations changing input/output/node names + for (auto& op : *mlprogram_main_block_->mutable_operations()) { + for (auto& input : *op.mutable_inputs()) { + for (auto& arg : *input.second.mutable_arguments()) { + arg.set_name(GetSafeName(arg.name())); + } + } + + for (auto& output : *op.mutable_outputs()) { + output.set_name(GetSafeName(output.name())); + } + } +} + std::unique_ptr ModelBuilder::CreateOperation(const Node& node, std::string_view op_type, std::string_view suffix) { @@ -472,14 +592,9 @@ std::unique_ptr ModelBuilder::CreateOperation(c return op; } -void ModelBuilder::AddConstant(std::string_view name, const ONNX_NAMESPACE::TensorProto& initializer) { - MILSpec::Value coreml_tensor = OnnxTensorToCoreMLTensor(initializer, *weights_file_writer_); - AddConstantOperation(name, std::move(coreml_tensor)); -} - -void ModelBuilder::AddConstantOperation(std::string_view name, MILSpec::Value&& coreml_tensor) { +const std::string& ModelBuilder::AddConstantOperation(std::string_view name, MILSpec::Value&& coreml_tensor) { // Replicates coremltools/converters/mil/backend/mil/load.py translate_const logic - MILSpec::Operation& const_op = *mlprogram_main_->mutable_operations()->Add(); + MILSpec::Operation& const_op = *mlprogram_main_block_->mutable_operations()->Add(); const_op.set_type("const"); MILSpec::NamedValueType& output = *const_op.mutable_outputs()->Add(); @@ -487,58 +602,63 @@ void ModelBuilder::AddConstantOperation(std::string_view name, MILSpec::Value&& *output.mutable_type() = coreml_tensor.type(); auto& attr_map = *const_op.mutable_attributes(); - attr_map["name"] = CreateScalarTensorValue(std::string(name)); + // the operation name doesn't really matter as it isn't used elsewhere, so sanitize name now + attr_map["name"] = CreateScalarTensorValue(GetSafeName(output.name())); attr_map["val"] = std::move(coreml_tensor); + + return output.name(); } // Add operation to the Block for the main function in the ML Program void ModelBuilder::AddOperation(std::unique_ptr operation) { - mlprogram_main_->mutable_operations()->AddAllocated(operation.release()); + mlprogram_main_block_->mutable_operations()->AddAllocated(operation.release()); } -std::string ModelBuilder::AddTensorValueAsConstantOperation(std::string_view op_type, std::string_view value_type, - MILSpec::Value&& input_value) { +const std::string& ModelBuilder::AddTensorValueAsConstantOperation(std::string_view op_type, + 
std::string_view value_type, + MILSpec::Value&& input_value) { auto unique_value_name = GetUniqueName(MakeString(op_type, "_", value_type)); - AddConstantOperation(unique_value_name, std::move(input_value)); - return unique_value_name; + return AddConstantOperation(unique_value_name, std::move(input_value)); } template -std::string ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, gsl::span value, - std::optional> shape) { +std::string_view ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, + gsl::span value, + std::optional> shape) { // add specialization below static_assert(false_for_T, "Missing specialization for value type"); - return ""; // unreachable + + return "ModelBuilder::AddConstant error"; // unreachable } template <> -std::string ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, - gsl::span value, - std::optional> shape) { +std::string_view ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, + gsl::span value, + std::optional> shape) { auto input_value = CreateTensorValue(value, shape); return AddTensorValueAsConstantOperation(op_type, value_type, std::move(input_value)); } template <> -std::string ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, - gsl::span value, - std::optional> shape) { +std::string_view ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, + gsl::span value, + std::optional> shape) { auto input_value = CreateTensorValue(value, shape); // CoreML uses int32 return AddTensorValueAsConstantOperation(op_type, value_type, std::move(input_value)); } template <> -std::string ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, - gsl::span value, - std::optional> shape) { +std::string_view ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, + gsl::span value, + std::optional> shape) { auto input_value = CreateTensorValue(value, shape); return AddTensorValueAsConstantOperation(op_type, value_type, std::move(input_value)); } template <> -std::string ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, - gsl::span value, - std::optional> shape) { +std::string_view ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, + gsl::span value, + std::optional> shape) { auto input_value = CreateTensorValue(value, shape); return AddTensorValueAsConstantOperation(op_type, value_type, std::move(input_value)); } @@ -581,11 +701,13 @@ Status ModelBuilder::RegisterInitializers() { continue; } - if (create_ml_program_) { #if defined(COREML_ENABLE_MLPROGRAM) - AddConstant(name, tensor); + if (create_ml_program_) { + MILSpec::Value coreml_tensor = OnnxTensorToCoreMLTensor(tensor, *weights_file_writer_); + ORT_IGNORE_RETURN_VALUE(AddConstantOperation(name, std::move(coreml_tensor))); + } else #endif - } else { + { std::unique_ptr layer = std::make_unique(); layer->set_name(GetUniqueName("initializer_" + name)); @@ -616,32 +738,33 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i if (is_input) { // input should not be an initializer - if (Contains(GetInitializerTensors(), name)) + if (Contains(GetInitializerTensors(), name)) { return Status::OK(); + } // This input will not be used - if (Contains(skipped_inputs_, name)) + if (Contains(skipped_inputs_, name)) { return Status::OK(); + } } auto* model_description = 
coreml_model_->mutable_description(); - auto& input_output = is_input - ? *model_description->mutable_input()->Add() - : *model_description->mutable_output()->Add(); + auto& input_output = is_input ? *model_description->mutable_input()->Add() + : *model_description->mutable_output()->Add(); input_output.set_name(name); + auto* multi_array = input_output.mutable_type()->mutable_multiarraytype(); std::vector shape; - ORT_RETURN_IF_NOT(GetShape(node_arg, shape, logger_), - "Unable to get shape for ", input_output_type, ": ", name); + ORT_RETURN_IF_NOT(GetShape(node_arg, shape, logger_), "Unable to get shape for ", input_output_type, ": ", name); if (shape.empty()) { - // If we have an empty shape, this is a scalar input, - // Since all the input output of CoreML EP is MultiArray, we will make the scalar input output as a {1} MultiArray + // If we have an empty shape, this is a scalar + // Since all the input/output of CoreML EP is MultiArray, we will make the scalar input/output a {1} MultiArray shape.push_back(1); - // we need to change the shapes of these scalar outputs back to {} when CoreML EP returns these values to ORT + // we need to change the shapes of scalar outputs back to {} when CoreML EP returns values to ORT if (!is_input) { AddScalarOutput(name); } @@ -713,13 +836,20 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i #if defined(COREML_ENABLE_MLPROGRAM) if (create_ml_program_) { - MILSpec::Function& main = (*coreml_model_->mutable_mlprogram()->mutable_functions())["main"]; if (is_input) { - // the model inputs need to be wired up as args to the 'main' function - main.mutable_inputs()->Add(CreateNamedTensorValueType(node_arg)); + // the model inputs need to be wired up as args to the 'main' function. + auto tensor_value_type = CreateNamedTensorValueType(node_arg); + tensor_value_type.set_name(name); + if (node_arg.Shape()->dim_size() == 0) { + // update shape from {} to {1} (same change we made at the model input level above). + tensor_value_type.mutable_type()->mutable_tensortype()->set_rank(1); + tensor_value_type.mutable_type()->mutable_tensortype()->add_dimensions()->mutable_constant()->set_size(1); + } + + mlprogram_main_fn_->mutable_inputs()->Add(std::move(tensor_value_type)); } else { // the model outputs need to be set as outputs of the Block for the 'main' function - *mlprogram_main_->mutable_outputs()->Add() = node_arg.Name(); + *mlprogram_main_block_->mutable_outputs()->Add() = name; } } #endif // defined(COREML_ENABLE_MLPROGRAM) @@ -744,7 +874,7 @@ Status ModelBuilder::ProcessNodes() { // This shouldn't happen as this is called from CoreMLExecutionProvider::Compile and should only be processing // nodes that we said were supported and were returned from CoreMLExecutionProvider::GetCapability. return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Node [", node.Name(), "], type [", node.OpType(), "] is not supported"); + "Node [", node.Name(), "], type [", node.OpType(), "] was not able to be processed"); } } @@ -767,6 +897,12 @@ Status ModelBuilder::CreateModel() { ORT_RETURN_IF_ERROR(ProcessNodes()); ORT_RETURN_IF_ERROR(RegisterModelOutputs()); +#if defined(COREML_ENABLE_MLPROGRAM) + if (create_ml_program_) { + SanitizeNames(); + } +#endif + return Status::OK(); } @@ -795,7 +931,7 @@ Status ModelBuilder::SaveModel() { #if defined(COREML_ENABLE_MLPROGRAM) // need to delete the ModelPackage instance for it to write out the manifest. clear out the other ML Program // related types as well. 
- mlprogram_main_ = nullptr; + mlprogram_main_block_ = nullptr; mlpackage_.reset(); weights_file_writer_.reset(); #endif @@ -804,11 +940,51 @@ Status ModelBuilder::SaveModel() { } Status ModelBuilder::LoadModel(std::unique_ptr& model) { - model = std::make_unique(model_output_path_, - std::move(input_output_info_), - std::move(scalar_outputs_), - std::move(int64_outputs_), - logger_, coreml_flags_); +#if defined(COREML_ENABLE_MLPROGRAM) + if (create_ml_program_) { + // we need to provide the sanitized names for model inputs/outputs so that info is captured. + // the input/output matching when we execute the model from the CoreML EP is based on order, so the change + // to the names doesn't matter for that. + auto get_sanitized_names = [this](std::vector&& names) -> std::vector { + std::vector output(std::move(names)); + + for (std::string& name : output) { + name = GetSafeName(name); + } + + return output; + }; + + // also need to update the keys in input_output_info_ + auto get_sanitized_io_info = [this](std::unordered_map&& info) { + std::unordered_map output; + output.reserve(info.size()); + + for (auto entry = info.begin(), end = info.end(); entry != end; ++entry) { + output.emplace(GetSafeName(entry->first), std::move(entry->second)); + } + + return output; + }; + + model = std::make_unique(model_output_path_, + get_sanitized_names(std::move(onnx_input_names_)), + get_sanitized_names(std::move(onnx_output_names_)), + get_sanitized_io_info(std::move(input_output_info_)), + std::move(scalar_outputs_), + std::move(int64_outputs_), + logger_, coreml_flags_); + } else +#endif + { + model = std::make_unique(model_output_path_, + std::move(onnx_input_names_), + std::move(onnx_output_names_), + std::move(input_output_info_), + std::move(scalar_outputs_), + std::move(int64_outputs_), + logger_, coreml_flags_); + } return model->LoadModel(); // load using CoreML API, including compilation } @@ -816,8 +992,11 @@ Status ModelBuilder::LoadModel(std::unique_ptr& model) { // static Status ModelBuilder::Build(const GraphViewer& graph_viewer, const logging::Logger& logger, int32_t coreml_version, uint32_t coreml_flags, + std::vector&& onnx_input_names, + std::vector&& onnx_output_names, std::unique_ptr& model) { - ModelBuilder builder(graph_viewer, logger, coreml_version, coreml_flags); + ModelBuilder builder(graph_viewer, logger, coreml_version, coreml_flags, + std::move(onnx_input_names), std::move(onnx_output_names)); ORT_RETURN_IF_ERROR(builder.CreateModel()); ORT_RETURN_IF_ERROR(builder.SaveModel()); @@ -847,20 +1026,31 @@ void ModelBuilder::AddInputToSkip(const std::string& input_name) { skipped_inputs_.insert(input_name); } -std::string ModelBuilder::GetUniqueName(std::string_view base_name) { +const std::string& ModelBuilder::GetUniqueName(const std::string& base_name) { + if (unique_names_.find(base_name) == unique_names_.end()) { + return *unique_names_.insert(base_name).first; + } + std::string unique_name; - do { - std::ostringstream os; - os << base_name << "_token_" << name_token_++; - unique_name = os.str(); - } while (Contains(unique_names_, unique_name)); + std::string suffix; + + // supports up to 1000 unique names without having to grow in the loop + unique_name.reserve(base_name.size() + 5); + unique_name = base_name; + + while (Contains(unique_names_, unique_name)) { + // assign followed by += to avoid creating temporary strings. 
+ unique_name = base_name; + unique_name += "__"; + unique_name += std::to_string(name_token_++); + } - return unique_name; + return *unique_names_.insert(unique_name).first; } -std::string ModelBuilder::GetUniqueName(const Node& node, std::string_view suffix) { +const std::string& ModelBuilder::GetUniqueName(const Node& node, std::string_view suffix) { if (node.Name().empty()) { - return GetUniqueName(MakeString("Node_", node.Index(), "_", node.OpType(), suffix)); + return GetUniqueName(MakeString(node.OpType(), "_", node.Index(), suffix)); } else { return GetUniqueName(node.Name() + std::string(suffix)); } diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.h b/onnxruntime/core/providers/coreml/builders/model_builder.h index 961ba647257b5..8f85ab2c09e7c 100644 --- a/onnxruntime/core/providers/coreml/builders/model_builder.h +++ b/onnxruntime/core/providers/coreml/builders/model_builder.h @@ -25,17 +25,20 @@ namespace onnxruntime { namespace coreml { class IOpBuilder; -class Model; class ModelBuilder { private: ModelBuilder(const GraphViewer& graph_viewer, const logging::Logger& logger, - int32_t coreml_version, uint32_t coreml_flags); + int32_t coreml_version, uint32_t coreml_flags, + std::vector&& onnx_input_names, + std::vector&& onnx_output_names); public: // Create the CoreML model, serialize to disk, load and compile using the CoreML API and return in `model` static Status Build(const GraphViewer& graph_viewer, const logging::Logger& logger, int32_t coreml_version, uint32_t coreml_flags, + std::vector&& onnx_input_names, + std::vector&& onnx_output_names, std::unique_ptr& model); ~ModelBuilder(); @@ -101,8 +104,8 @@ class ModelBuilder { /// /// Unique name generated for value. template - std::string AddConstant(std::string_view op_type, std::string_view value_type, gsl::span value, - std::optional> shape = std::nullopt) { + std::string_view AddConstant(std::string_view op_type, std::string_view value_type, gsl::span value, + std::optional> shape = std::nullopt) { static_assert(std::is_same_v || std::is_same_v || std::is_same_v || @@ -113,8 +116,8 @@ class ModelBuilder { } template - std::string AddConstant(std::string_view op_type, std::string_view value_type, const std::vector& value, - std::optional> shape = std::nullopt) { + std::string_view AddConstant(std::string_view op_type, std::string_view value_type, const std::vector& value, + std::optional> shape = std::nullopt) { return AddConstant(op_type, value_type, AsSpan(value), shape); } @@ -122,17 +125,10 @@ class ModelBuilder { /// Add a scalar value as a 'const' operation. See AddConstant for details. 
/// template - std::string AddScalarConstant(std::string_view op_type, std::string_view value_type, const T& value) { + std::string_view AddScalarConstant(std::string_view op_type, std::string_view value_type, const T& value) { return AddConstant(op_type, value_type, AsSpan({value}), AsSpan({})); } - /// - /// Add an existing a constant ONNX initializer to the ML Program as a 'const' operation - /// - /// Initializer name - /// Initializer data - void AddConstant(std::string_view name, const ONNX_NAMESPACE::TensorProto& initializer); - // add the operation to the main function void AddOperation(std::unique_ptr operation); #endif @@ -149,18 +145,26 @@ class ModelBuilder { // be added to CoreML model, since CoreML does not like input unused void AddInputToSkip(const std::string& input_name); - std::string GetUniqueName(std::string_view base_name); - std::string GetUniqueName(const Node& node, std::string_view suffix); + const std::string& GetUniqueName(const std::string& base_name); + const std::string& GetUniqueName(const Node& node, std::string_view suffix); + + const logging::Logger& Logger() const { return logger_; } private: #if defined(COREML_ENABLE_MLPROGRAM) template - std::string AddConstantImpl(std::string_view op_type, std::string_view value_type, gsl::span value, - std::optional> shape = std::nullopt); - - void AddConstantOperation(std::string_view name, COREML_SPEC::MILSpec::Value&& initializer); - std::string AddTensorValueAsConstantOperation(std::string_view op_type, std::string_view value_type, - COREML_SPEC::MILSpec::Value&& input_value); + std::string_view AddConstantImpl(std::string_view op_type, std::string_view value_type, gsl::span value, + std::optional> shape = std::nullopt); + + // apply the CoreML naming rules and fix any invalid names. + const std::string& GetSafeName(const std::string& name); + // sanitize all the names in the ML Model + void SanitizeNames(); + + // add Value as a const operation. return value name in case sanitization changed it + const std::string& AddConstantOperation(std::string_view name, COREML_SPEC::MILSpec::Value&& initializer); + const std::string& AddTensorValueAsConstantOperation(std::string_view op_type, std::string_view value_type, + COREML_SPEC::MILSpec::Value&& input_value); #endif // Convert the ONNX model in graph_viewer_ to a CoreML::Specification::Model and serialize to disk. @@ -193,6 +197,9 @@ class ModelBuilder { const bool create_ml_program_; // ML Program (CoreML5, iOS 15+, macOS 12+) or NeuralNetwork (old) const std::string model_output_path_; // create_ml_program_ ? dir for mlpackage : filename for mlmodel + std::vector onnx_input_names_; + std::vector onnx_output_names_; + std::unique_ptr coreml_model_; std::unordered_set scalar_outputs_; std::unordered_set int64_outputs_; @@ -208,9 +215,19 @@ class ModelBuilder { // mlprogram_main_ is the main block of the CoreML ML Program. // It is set in CreateModel to the CoreML Model.mlprogram.functions['main'].block_specializations['CoreML'] // entry we create. - COREML_SPEC::MILSpec::Block* mlprogram_main_{nullptr}; + COREML_SPEC::MILSpec::Function* mlprogram_main_fn_{nullptr}; // Function that contains a Block with the operations + COREML_SPEC::MILSpec::Block* mlprogram_main_block_{nullptr}; // Block that all the operations are added to std::unique_ptr mlpackage_; std::unique_ptr weights_file_writer_; + + // Values must start with [a-zA-A_] + // Additionally they can't be in a list of reserved words. 
+ // If we need to sanitize an initializer name we do so during PreprocessInitializers and apply the change during + // RegisterInitializers. + // We also check inputs in AddOperation and apply the change there. + // This means an op builder author doesn't need to be aware of the renaming. + // https://github.com/apple/coremltools/blob/8b37641f243b1a3e81452feea311c6e30dcc9287/coremltools/converters/mil/mil/passes/defs/preprocess.py#L146-L149 + std::unordered_map values_to_rename_; #endif }; diff --git a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc index 8e718da07703c..0ba715cc7c6d9 100644 --- a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc +++ b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc @@ -114,28 +114,27 @@ common::Status CoreMLExecutionProvider::Compile(const std::vector& node_compute_funcs) { for (const auto& fused_node_and_graph : fused_nodes_and_graphs) { Node& fused_node = fused_node_and_graph.fused_node; - const onnxruntime::GraphViewer& graph_viewer(fused_node_and_graph.filtered_graph); std::unique_ptr coreml_model; - ORT_RETURN_IF_ERROR(coreml::ModelBuilder::Build(graph_viewer, *GetLogger(), coreml_version_, coreml_flags_, - coreml_model)); - { - const auto& input_defs = fused_node.InputDefs(); - std::vector onnx_input_names(input_defs.size()); - for (size_t i = 0, end = input_defs.size(); i < end; ++i) { - onnx_input_names[i] = input_defs[i]->Name(); - } - coreml_model->SetOnnxInputs(std::move(onnx_input_names)); - } + auto get_names = [](const ConstPointerContainer>& args) -> std::vector { + std::vector names; + names.reserve(args.size()); - { - const auto& output_defs = fused_node.OutputDefs(); - std::vector onnx_output_names(output_defs.size()); - for (size_t i = 0, end = output_defs.size(); i < end; ++i) { - onnx_output_names[i] = output_defs[i]->Name(); - } - coreml_model->SetOnnxOutputs(std::move(onnx_output_names)); + for (const NodeArg* def : args) { + names.push_back(def->Name()); + } + + return names; + }; + + std::vector onnx_input_names = get_names(fused_node.InputDefs()); + std::vector onnx_output_names = get_names(fused_node.OutputDefs()); + + const onnxruntime::GraphViewer& graph_viewer(fused_node_and_graph.filtered_graph); + ORT_RETURN_IF_ERROR(coreml::ModelBuilder::Build(graph_viewer, *GetLogger(), coreml_version_, coreml_flags_, + std::move(onnx_input_names), std::move(onnx_output_names), + coreml_model)); } coreml_models_.emplace(fused_node.Name(), std::move(coreml_model)); @@ -153,13 +152,14 @@ common::Status CoreMLExecutionProvider::Compile(const std::vector(state); - const auto& model_inputs = model->GetOnnxInputs(); - const auto& model_outputs = model->GetOnnxOutputs(); + + // input/output names used by the CoreML model in the order that matches the fused_node InputDefs/OutputDefs + const auto& model_inputs = model->GetOrderedInputs(); + const auto& model_outputs = model->GetOrderedOutputs(); ORT_RETURN_IF_NOT(model_inputs.size() <= num_inputs, "Inconsistent input sizes"); ORT_RETURN_IF_NOT(model_outputs.size() == num_outputs, "Inconsistent output sizes"); @@ -182,28 +182,25 @@ common::Status CoreMLExecutionProvider::Compile(const std::vectorshape; - ORT_RETURN_IF(!coreml::IsStaticShape(inferred_shape) && coreml::DoesShapeSpecifyZeroElements(shape), - "Input (", input_name, ") has a dynamic shape (", coreml::Shape2String(inferred_shape), - ") but the runtime shape (", coreml::Shape2String(shape), - ") has zero elements. 
This is not supported by the CoreML EP."); - } + const auto& inferred_shape = input_info->shape; + ORT_RETURN_IF(!coreml::IsStaticShape(inferred_shape) && coreml::DoesShapeSpecifyZeroElements(shape), + "Input (", input_name, ") has a dynamic shape (", coreml::Shape2String(inferred_shape), + ") but the runtime shape (", coreml::Shape2String(shape), + ") has zero elements. This is not supported by the CoreML EP."); // If we have an empty shape, this is a scalar input, // Since all the input output of CoreML EP is MultiArray, we will make the scalar input as a {1} MultiArray - if (shape.empty()) + if (shape.empty()) { shape.push_back(1); + } // CoreML MLMultiArray API expect input to be non-const // https://developer.apple.com/documentation/coreml/mlmultiarray/2881219-initwithdatapointer?language=objc void* inputBuffer = const_cast(input_tensor.GetTensorRawData()); - inputs.emplace( - input_name, - coreml::OnnxTensorData{ - coreml::OnnxTensorInfo{tensor_info.GetElementType(), shape}, - inputBuffer, - }); + inputs.emplace(input_name, coreml::OnnxTensorData{ + coreml::OnnxTensorInfo{tensor_info.GetElementType(), shape}, + inputBuffer, + }); } // From this point we will need to take the exclusive lock on the model until the Predict is @@ -215,14 +212,13 @@ common::Status CoreMLExecutionProvider::Compile(const std::vector static_shape) -> void* { + [&ctx, &model_outputs](const std::string& name, + int32_t requested_onnx_tensor_element_type, + gsl::span static_shape) -> void* { const auto model_output_it = std::find(model_outputs.begin(), model_outputs.end(), name); ORT_ENFORCE(model_output_it != model_outputs.end(), "Failed to find CoreML model output name: ", name); - const auto output_idx = gsl::narrow_cast(std::distance(model_outputs.begin(), model_output_it)); + const auto output_idx = gsl::narrow_cast(std::distance(model_outputs.begin(), model_output_it)); auto output_tensor = ctx.GetOutput(output_idx, static_shape.data(), static_shape.size()); const auto type_and_shape_info = output_tensor.GetTensorTypeAndShapeInfo(); @@ -243,13 +239,15 @@ common::Status CoreMLExecutionProvider::Compile(const std::vectorIsScalarOutput(output_name)) + if (model->IsScalarOutput(output_name)) { output_shape.clear(); + } // Since CoreML EP only accepts int32 output type and onnx requires int64 output, // We are going to set the model output (from int32) ->int64 - if (model->IsInt64Output(output_name)) + if (model->IsInt64Output(output_name)) { output_type = ONNX_NAMESPACE::TensorProto_DataType_INT64; + } outputs.emplace(output_name, coreml::OnnxTensorInfo{output_type, output_shape}); } diff --git a/onnxruntime/core/providers/coreml/dump_mlprogram_model.py b/onnxruntime/core/providers/coreml/dump_mlprogram_model.py new file mode 100644 index 0000000000000..a3ceee70684dc --- /dev/null +++ b/onnxruntime/core/providers/coreml/dump_mlprogram_model.py @@ -0,0 +1,27 @@ +import sys + +import coremltools as ct + +if len(sys.argv) < 2: + print(f"Usage: {sys.argv[0]} ") + print("If generated by onnxruntime this will be /Data/com.microsoft.onnxruntime/model.mlmodel") + sys.exit(-1) + +model_path = sys.argv[1] +m = ct.models.MLModel(model_path) + +spec = m.get_spec() +print(spec) + +# Example code if you want to filter output or do more advanced things +# main = spec.mlProgram.functions["main"] +# block = main.block_specializations[main.opset] +# print(f"{len(block.operations)} operators") +# for op in block.operations: +# if op.type == 'const': +# if op.attributes["name"].immediateValue.tensor.strings.values[0] == 
"conv_0_pad_type_0": +# print(f"Conv pad_type={op.attributes['val'].immediateValue.tensor.strings.values}") +# +# if op.type == 'conv': +# #print(op) +# pass diff --git a/onnxruntime/core/providers/coreml/model/host_utils.h b/onnxruntime/core/providers/coreml/model/host_utils.h index 4f9a014c4d885..a9991ccb945ce 100644 --- a/onnxruntime/core/providers/coreml/model/host_utils.h +++ b/onnxruntime/core/providers/coreml/model/host_utils.h @@ -67,6 +67,12 @@ int CoreMLVersion(); // Get a temporary macOS/iOS temp file path std::string GetTemporaryFilePath(); +#if !defined(NDEBUG) && defined(__APPLE__) +// Override location the model is written to so that a) it's easily found and b) it is not automatically deleted +// when the EP exits. Use to debug the model that is generated. +// See onnxruntime/core/providers/coreml/dump_mlprogram_model.py for a script to dump the ML Program. +constexpr const char* kOverrideModelOutputDirectoryEnvVar = "ORT_COREML_EP_MODEL_DIR"; +#endif } // namespace util } // namespace coreml } // namespace onnxruntime diff --git a/onnxruntime/core/providers/coreml/model/host_utils.mm b/onnxruntime/core/providers/coreml/model/host_utils.mm index 0ae0cf8f0d207..5487ea35388f5 100644 --- a/onnxruntime/core/providers/coreml/model/host_utils.mm +++ b/onnxruntime/core/providers/coreml/model/host_utils.mm @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "core/platform/env.h" #include "core/providers/coreml/model/host_utils.h" #import @@ -31,6 +32,15 @@ int32_t CoreMLVersion() { std::string GetTemporaryFilePath() { // Get temporary directory for user. NSURL* temporary_directory_url = [NSURL fileURLWithPath:NSTemporaryDirectory() isDirectory:YES]; + +#if !defined(NDEBUG) + std::string path_override = Env::Default().GetEnvironmentVar(kOverrideModelOutputDirectoryEnvVar); + if (!path_override.empty()) { + NSString* ns_path_override = [NSString stringWithUTF8String:path_override.c_str()]; + temporary_directory_url = [NSURL fileURLWithPath:ns_path_override isDirectory:YES]; + } +#endif + // Generate a Unique file name to use. NSString* temporary_filename = [[NSProcessInfo processInfo] globallyUniqueString]; diff --git a/onnxruntime/core/providers/coreml/model/model.h b/onnxruntime/core/providers/coreml/model/model.h index b940c4b768aec..e3cd43d786fc3 100644 --- a/onnxruntime/core/providers/coreml/model/model.h +++ b/onnxruntime/core/providers/coreml/model/model.h @@ -35,6 +35,8 @@ using GetOutputTensorMutableRawDataFn = std::function&& model_input_names, + std::vector&& model_output_names, std::unordered_map&& input_output_info, std::unordered_set&& scalar_outputs, std::unordered_set&& int64_outputs, @@ -60,12 +62,11 @@ class Model { // Mutex for exclusive lock to this model object OrtMutex& GetMutex() { return mutex_; } - // Input and output names in the onnx model's order - const std::vector& GetOnnxInputs() const { return onnx_inputs_; } - void SetOnnxInputs(std::vector&& inputs) { onnx_inputs_ = std::move(inputs); } - - const std::vector& GetOnnxOutputs() const { return onnx_outputs_; } - void SetOnnxOutputs(std::vector&& outputs) { onnx_outputs_ = std::move(outputs); } + // Input and output names in the ORT fused node's order. + // Names may have been adjusted from the originals due to CoreML naming rules. + // We do inputs/outputs based on order at the ONNX level so this doesn't matter. 
+ const std::vector& GetOrderedInputs() const { return model_input_names_; } + const std::vector& GetOrderedOutputs() const { return model_output_names_; } const OnnxTensorInfo* TryGetInputOutputInfo(const std::string& name) const { const auto info_it = input_output_info_.find(name); @@ -80,13 +81,13 @@ class Model { private: std::unique_ptr execution_; + std::vector model_input_names_; // input names in the order of the ORT fused node's inputs + std::vector model_output_names_; // output names in the order of the ORT fused node's outputs + std::unordered_map input_output_info_; std::unordered_set scalar_outputs_; std::unordered_set int64_outputs_; - std::vector onnx_inputs_; - std::vector onnx_outputs_; - OrtMutex mutex_; }; diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm index d5cd70bff9479..1434043e064f4 100644 --- a/onnxruntime/core/providers/coreml/model/model.mm +++ b/onnxruntime/core/providers/coreml/model/model.mm @@ -19,6 +19,7 @@ #include "core/common/narrow.h" #include "core/common/span_utils.h" #include "core/graph/onnx_protobuf.h" +#include "core/platform/env.h" #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/coreml_provider_factory.h" #include "core/providers/coreml/model/host_utils.h" @@ -287,6 +288,14 @@ - (void)cleanup { compiled_model_path_ = nil; } +#if !defined(NDEBUG) + std::string path_override = Env::Default().GetEnvironmentVar(util::kOverrideModelOutputDirectoryEnvVar); + if (!path_override.empty()) { + // don't cleanup + coreml_model_path_ = nil; + } +#endif + if (coreml_model_path_ != nil) { error = nil; [[NSFileManager defaultManager] removeItemAtPath:coreml_model_path_ error:&error]; @@ -487,12 +496,16 @@ Status Predict(const std::unordered_map& inputs, } Model::Model(const std::string& path, + std::vector&& model_input_names, + std::vector&& model_output_names, std::unordered_map&& input_output_info, std::unordered_set&& scalar_outputs, std::unordered_set&& int64_outputs, const logging::Logger& logger, uint32_t coreml_flags) : execution_(std::make_unique(path, logger, coreml_flags)), + model_input_names_(std::move(model_input_names)), + model_output_names_(std::move(model_output_names)), input_output_info_(std::move(input_output_info)), scalar_outputs_(std::move(scalar_outputs)), int64_outputs_(std::move(int64_outputs)) { diff --git a/onnxruntime/core/providers/coreml/model/model_stub.cc b/onnxruntime/core/providers/coreml/model/model_stub.cc index 087c9f8c05d5f..c6f2e7401ea1e 100644 --- a/onnxruntime/core/providers/coreml/model/model_stub.cc +++ b/onnxruntime/core/providers/coreml/model/model_stub.cc @@ -9,12 +9,16 @@ namespace coreml { class Execution {}; Model::Model(const std::string& /*path*/, + std::vector&& model_input_names, + std::vector&& model_output_names, std::unordered_map&& input_output_info, std::unordered_set&& scalar_outputs, std::unordered_set&& int64_outputs, const logging::Logger& /*logger*/, uint32_t /*coreml_flags*/) : execution_(std::make_unique()), + model_input_names_(std::move(model_input_names)), + model_output_names_(std::move(model_output_names)), input_output_info_(std::move(input_output_info)), scalar_outputs_(std::move(scalar_outputs)), int64_outputs_(std::move(int64_outputs)) { diff --git a/onnxruntime/core/providers/cpu/tensor/reshape_helper.h b/onnxruntime/core/providers/cpu/tensor/reshape_helper.h index 5961686674424..d7ceda16e61ea 100644 --- a/onnxruntime/core/providers/cpu/tensor/reshape_helper.h +++ 
b/onnxruntime/core/providers/cpu/tensor/reshape_helper.h @@ -37,12 +37,14 @@ class ReshapeHelper { if (unknown_dim != -1) { // calculate unknown dimension ORT_ENFORCE(size != 0 && (input_shape_size % size) == 0, - "The input tensor cannot be reshaped to the requested shape. Input shape:", input_shape, ", requested shape:", TensorShape(requested_shape)); + "The input tensor cannot be reshaped to the requested shape. Input shape:", input_shape, + ", requested shape:", TensorShape(requested_shape)); requested_shape[unknown_dim] = input_shape_size / size; } else { // check if the output shape is valid. ORT_ENFORCE(input_shape_size == size, - "The input tensor cannot be reshaped to the requested shape. Input shape:", input_shape, ", requested shape:", TensorShape(requested_shape)); + "The input tensor cannot be reshaped to the requested shape. Input shape:", input_shape, + ", requested shape:", TensorShape(requested_shape)); } } }; diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index 7d4111e3b9c39..729ad34368453 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -64,17 +64,22 @@ namespace perftest { "\t Refer to onnxruntime_session_options_config_keys.h for valid keys and values. \n" "\t [Example] -C \"session.disable_cpu_ep_fallback|1 ep.context_enable|1\" \n" "\t-i: Specify EP specific runtime options as key value pairs. Different runtime options available are: \n" + "\t [Usage]: -e -i '| |'\n" + "\n" "\t [DML only] [performance_preference]: DML device performance preference, options: 'default', 'minimum_power', 'high_performance', \n" "\t [DML only] [device_filter]: DML device filter, options: 'any', 'gpu', 'npu', \n" "\t [DML only] [disable_metacommands]: Options: 'true', 'false', \n" "\t [DML only] [enable_dynamic_graph_fusion]: Options: 'true', 'false', \n" "\t [DML only] [enable_graph_serialization]: Options: 'true', 'false', \n" + "\n" "\t [OpenVINO only] [device_type]: Overrides the accelerator hardware type and precision with these values at runtime.\n" "\t [OpenVINO only] [device_id]: Selects a particular hardware device for inference.\n" "\t [OpenVINO only] [enable_npu_fast_compile]: Optionally enabled to speeds up the model's compilation on NPU device targets.\n" "\t [OpenVINO only] [num_of_threads]: Overrides the accelerator hardware type and precision with these values at runtime.\n" "\t [OpenVINO only] [cache_dir]: Explicitly specify the path to dump and load the blobs(Model caching) or cl_cache (Kernel Caching) files feature. If blob files are already present, it will be directly loaded.\n" "\t [OpenVINO only] [enable_opencl_throttling]: Enables OpenCL queue throttling for GPU device(Reduces the CPU Utilization while using GPU) \n" + "\t [Example] [For OpenVINO EP] -e openvino -i \"device_type|CPU_FP32 enable_npu_fast_compile|true num_of_threads|5 enable_opencl_throttling|true cache_dir|\"\"\"\n" + "\n" "\t [QNN only] [backend_path]: QNN backend path. e.g '/folderpath/libQnnHtp.so', '/folderpath/libQnnCpu.so'.\n" "\t [QNN only] [profiling_level]: QNN profiling level, options: 'basic', 'detailed', default 'off'.\n" "\t [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n" @@ -89,9 +94,8 @@ namespace perftest { "\t [QNN only] [htp_arch]: The minimum HTP architecture. The driver will use ops compatible with this architecture. \n" "\t Options are '0', '68', '69', '73', '75'. Defaults to '0' (none). 
\n" "\t [QNN only] [device_id]: The ID of the device to use when setting 'htp_arch'. Defaults to '0' (for single device). \n" - "\t [Usage]: -e -i '| |'\n\n" - "\t [Example] [For OpenVINO EP] -e openvino -i \"device_type|CPU_FP32 enable_npu_fast_compile|true num_of_threads|5 enable_opencl_throttling|true cache_dir|\"\"\"\n" - "\t [Example] [For QNN EP] -e qnn -i \"backend_path|/folderpath/libQnnCpu.so\" \n\n" + "\t [Example] [For QNN EP] -e qnn -i \"backend_path|/folderpath/libQnnCpu.so\" \n" + "\n" "\t [TensorRT only] [trt_max_partition_iterations]: Maximum iterations for TensorRT parser to get capability.\n" "\t [TensorRT only] [trt_min_subgraph_size]: Minimum size of TensorRT subgraphs.\n" "\t [TensorRT only] [trt_max_workspace_size]: Set TensorRT maximum workspace size in byte.\n" @@ -108,20 +112,23 @@ namespace perftest { "\t [TensorRT only] [trt_force_sequential_engine_build]: Force TensorRT engines to be built sequentially.\n" "\t [TensorRT only] [trt_context_memory_sharing_enable]: Enable TensorRT context memory sharing between subgraphs.\n" "\t [TensorRT only] [trt_layer_norm_fp32_fallback]: Force Pow + Reduce ops in layer norm to run in FP32 to avoid overflow.\n" - "\t [Usage]: -e -i '| |'\n\n" - "\t [Example] [For TensorRT EP] -e tensorrt -i 'trt_fp16_enable|true trt_int8_enable|true trt_int8_calibration_table_name|calibration.flatbuffers trt_int8_use_native_calibration_table|false trt_force_sequential_engine_build|false'\n" + "\t [Example] [For TensorRT EP] -e tensorrt -i 'trt_fp16_enable|true trt_int8_enable|true trt_int8_calibration_table_name|calibration.flatbuffers trt_int8_use_native_calibration_table|false trt_force_sequential_engine_build|false'\n" + "\n" "\t [NNAPI only] [NNAPI_FLAG_USE_FP16]: Use fp16 relaxation in NNAPI EP..\n" "\t [NNAPI only] [NNAPI_FLAG_USE_NCHW]: Use the NCHW layout in NNAPI EP.\n" "\t [NNAPI only] [NNAPI_FLAG_CPU_DISABLED]: Prevent NNAPI from using CPU devices.\n" "\t [NNAPI only] [NNAPI_FLAG_CPU_ONLY]: Using CPU only in NNAPI EP.\n" - "\t [Usage]: -e -i ' '\n\n" - "\t [Example] [For NNAPI EP] -e nnapi -i \" NNAPI_FLAG_USE_FP16 NNAPI_FLAG_USE_NCHW NNAPI_FLAG_CPU_DISABLED \"\n" + "\t [Example] [For NNAPI EP] -e nnapi -i \"NNAPI_FLAG_USE_FP16 NNAPI_FLAG_USE_NCHW NNAPI_FLAG_CPU_DISABLED\"\n" + "\n" + "\t [CoreML only] [COREML_FLAG_CREATE_MLPROGRAM]: Create an ML Program model instead of Neural Network.\n" + "\t [Example] [For CoreML EP] -e coreml -i \"COREML_FLAG_CREATE_MLPROGRAM\"\n" + "\n" "\t [SNPE only] [runtime]: SNPE runtime, options: 'CPU', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n" "\t [SNPE only] [priority]: execution priority, options: 'low', 'normal'. \n" "\t [SNPE only] [buffer_type]: options: 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. default: ITENSOR'. \n" "\t [SNPE only] [enable_init_cache]: enable SNPE init caching feature, set to 1 to enabled it. Disabled by default. 
\n" - "\t [Usage]: -e -i '| |' \n\n" - "\t [Example] [For SNPE EP] -e snpe -i \"runtime|CPU priority|low\" \n\n" + "\t [Example] [For SNPE EP] -e snpe -i \"runtime|CPU priority|low\" \n\n" + "\n" "\t-T [Set intra op thread affinities]: Specify intra op thread affinity string\n" "\t [Example]: -T 1,2;3,4;5,6 or -T 1-2;3-4;5-6 \n" "\t\t Use semicolon to separate configuration between threads.\n" diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 1934314b8ce43..9679ca6159464 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -468,7 +468,10 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); nnapi_flags |= NNAPI_FLAG_CPU_ONLY; } else if (key.empty()) { } else { - ORT_THROW("[ERROR] [NNAPI] wrong key type entered. Choose from the following runtime key options that are available for NNAPI. ['NNAPI_FLAG_USE_FP16', 'NNAPI_FLAG_USE_NCHW', 'NNAPI_FLAG_CPU_DISABLED', 'NNAPI_FLAG_CPU_ONLY'] \n"); + ORT_THROW( + "[ERROR] [NNAPI] wrong key type entered. Choose from the following runtime key options " + "that are available for NNAPI. " + "['NNAPI_FLAG_USE_FP16', 'NNAPI_FLAG_USE_NCHW', 'NNAPI_FLAG_CPU_DISABLED', 'NNAPI_FLAG_CPU_ONLY'] \n"); } } Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Nnapi(session_options, nnapi_flags)); @@ -476,10 +479,31 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); ORT_THROW("NNAPI is not supported in this build\n"); #endif } else if (provider_name_ == onnxruntime::kCoreMLExecutionProvider) { +#ifdef __APPLE__ #ifdef USE_COREML - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(session_options, 0)); + uint32_t coreml_flags = 0; + std::string ov_string = performance_test_config.run_config.ep_runtime_config_string; + std::istringstream ss(ov_string); + + std::string key; + while (ss >> key) { + if (key == "COREML_FLAG_CREATE_MLPROGRAM") { + coreml_flags |= COREML_FLAG_CREATE_MLPROGRAM; + std::cout << "Enabling ML Program.\n"; + } else if (key.empty()) { + } else { + ORT_THROW( + "[ERROR] [CoreML] wrong key type entered. Choose from the following runtime key options " + "that are available for CoreML. ['COREML_FLAG_CREATE_MLPROGRAM'] \n"); + } + } + // COREML_FLAG_CREATE_MLPROGRAM + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(session_options, coreml_flags)); +#else + ORT_THROW("CoreML is not supported in this build\n"); +#endif #else - ORT_THROW("COREML is not supported in this build\n"); + ORT_THROW("COREML is not supported on this platform.\n"); #endif } else if (provider_name_ == onnxruntime::kDmlExecutionProvider) { #ifdef USE_DML diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc index 7b6f1b9244be9..94817158017bd 100644 --- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc +++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc @@ -192,5 +192,25 @@ TEST(CoreMLExecutionProviderTest, TestOrtFormatModel) { #endif } +// Test that we fix invalid names in model inputs, initializers and outputs. 
+// Names in CoreML cannot start with [0-9] or contain anything but "[a-z][A-Z][0-9]_" +TEST(CoreMLExecutionProviderTest, TestNameSanitization) { + OpTester test("Clip", 11); + + std::vector dims{3, 3}; + test.AddInput("0", dims, + {-1.0f, 0.0f, 1.0f, + -6.0f, 0.0f, 6.0f, + -5.4f, 2.0f, 6.0f}); + test.AddInput("1.min", {}, {-5}, true); // add as initializers + test.AddInput("2/max", {}, {5}, true); + test.AddOutput("3", dims, + {-1.0f, 0.0f, 1.0f, + -5.0f, 0.0f, 5.0f, + -5.0f, 2.0f, 5.0f}); + + // TensorRT does not support Clip opset 11 yet. + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/math/clip_test.cc b/onnxruntime/test/providers/cpu/math/clip_test.cc index efb46e86d04e4..b5d5f84df950a 100644 --- a/onnxruntime/test/providers/cpu/math/clip_test.cc +++ b/onnxruntime/test/providers/cpu/math/clip_test.cc @@ -182,7 +182,7 @@ TEST(MathOpTest, Clip) { run_test(true); } -// Use clip between [0, 6] as Relu6 (for some EPs, such as NNAPI) +// Use clip between [0, 6] as Relu6 to test optimized path in some EPs, such as NNAPI and CoreML TEST(MathOpTest, Clip_Relu6) { // To test NNAPI EP, we need the min/max to be in initializers auto run_test = [](bool min_max_are_initializer) { @@ -208,6 +208,31 @@ TEST(MathOpTest, Clip_Relu6) { run_test(true); } +// Use clip between [0, inf] as Relu to test optimized path in some EPs, such as CoreML +TEST(MathOpTest, Clip_Relu) { + // To test NNAPI EP, we need the min/max to be in initializers + auto run_test = [](bool min_max_are_initializer) { + OpTester test("Clip", 11); + + std::vector dims{3, 3}; + test.AddInput("X", dims, + {-1.0f, 0.0f, 1.0f, + -6.0f, 3.5f, 6.0f, + -5.4f, 2.0f, 8.0f}); + test.AddInput("min", {}, {0.0f}, min_max_are_initializer); + test.AddOutput("Y", dims, + {0.0f, 0.0f, 1.0f, + 0.0f, 3.5f, 6.0f, + 0.0f, 2.0f, 8.0f}); + + // TensorRT does not support Clip opset 11 yet. 
+ test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + }; + + run_test(false); + run_test(true); +} + // Use clip between [-1, 1] as Relu1 (for some EPs, such as NNAPI) TEST(MathOpTest, Clip_Relu1) { // To test NNAPI EP, we need the min/max to be in initializers diff --git a/onnxruntime/test/providers/cpu/math/gemm_test.cc b/onnxruntime/test/providers/cpu/math/gemm_test.cc index bf089e083d67e..428925e154497 100644 --- a/onnxruntime/test/providers/cpu/math/gemm_test.cc +++ b/onnxruntime/test/providers/cpu/math/gemm_test.cc @@ -281,24 +281,31 @@ using GemmOpTypedTestsTypes = ::testing::Types; TYPED_TEST_SUITE(GemmOpTypedTests, GemmOpTypedTestsTypes); TYPED_TEST(GemmOpTypedTests, TestGemmScalarBroadcast) { - OpTester test("Gemm"); + auto run_test = [](bool b_is_initializer, bool c_is_initializer) { + OpTester test("Gemm"); - test.AddAttribute("transA", (int64_t)0); - test.AddAttribute("transB", (int64_t)0); - test.AddAttribute("alpha", 1.0f); - test.AddAttribute("beta", 1.0f); + test.AddAttribute("transA", (int64_t)0); + test.AddAttribute("transB", (int64_t)0); + test.AddAttribute("alpha", 1.0f); + test.AddAttribute("beta", 1.0f); - test.AddInput("A", {2, 4}, - {static_cast(1.0f), static_cast(2.0f), static_cast(3.0f), static_cast(4.0f), - static_cast(-1.0f), static_cast(-2.0f), static_cast(-3.0f), static_cast(-4.0f)}); - test.AddInput("B", {4, 3}, std::vector(12, static_cast(1.0f))); - test.AddInput("C", {1}, std::vector{static_cast(1.0f)}); - test.AddOutput("Y", {2, 3}, - {static_cast(11.0f), static_cast(11.0f), static_cast(11.0f), - static_cast(-9.0f), static_cast(-9.0f), static_cast(-9.0f)}); - test.Config(run_with_tunable_op) - .RunWithConfig(); + test.AddInput("A", {2, 4}, + {static_cast(1.0f), static_cast(2.0f), static_cast(3.0f), static_cast(4.0f), + static_cast(-1.0f), static_cast(-2.0f), static_cast(-3.0f), static_cast(-4.0f)}); + test.AddInput("B", {4, 3}, std::vector(12, static_cast(1.0f)), b_is_initializer); + test.AddInput("C", {1}, std::vector{static_cast(1.0f)}, c_is_initializer); + test.AddOutput("Y", {2, 3}, + {static_cast(11.0f), static_cast(11.0f), static_cast(11.0f), + static_cast(-9.0f), static_cast(-9.0f), static_cast(-9.0f)}); + test.Config(run_with_tunable_op) + .RunWithConfig(); + }; + + run_test(false, false); + // CoreML EP requires weight and bias to be initializers + run_test(true, true); } + TYPED_TEST(GemmOpTypedTests, TestGemm2DBroadcast_2) { OpTester test("Gemm"); diff --git a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc index ee18cf2cea6cb..cbb4531a50b7c 100644 --- a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc @@ -75,6 +75,43 @@ TEST(BatchNormTest, PositiveTestCase) { input_data_map.insert({"mean", mean}); input_data_map.insert({"var", var}); + InputShapesMap input_shapes_map; + vector input_shape{1, 1, 7, 7}; + input_shapes_map.insert({"X", input_shape}); + input_shapes_map.insert({"scale", {1}}); + input_shapes_map.insert({"B", {1}}); + input_shapes_map.insert({"mean", {1}}); + input_shapes_map.insert({"var", {1}}); + + auto expected_output = {1.01359f, 0.703983f, 0.641631f, 1.08571f, 0.939167f, 0.762469f, 0.682729f, 0.762401f, 0.787021f, + 1.06744f, 0.604378f, 0.957476f, 0.667302f, 0.901764f, 1.07566f, 1.01117f, 0.928324f, 0.897667f, + 0.705842f, 0.660885f, 0.977291f, 0.878918f, 0.818345f, 1.06608f, 0.839057f, 1.04796f, 0.621471f, + 0.781831f, 0.760527f, 0.835665f, 1.05825f, 
0.611442f, 0.781873f, 1.08437f, 0.907454f, 0.926173f, + 1.03375f, 0.707961f, 0.968646f, 0.621757f, 0.973095f, 0.700301f, 0.916723f, 0.807602f, 0.692598f, + 0.621972f, 0.707334f, 0.63723f, 0.63062f}; + float epsilon = 1e-05f; + TestBatchNorm(input_data_map, input_shapes_map, epsilon, expected_output, input_shape); +} + +TEST(BatchNormTest, PositiveTestCase_5D) { + // This input was taken from the SpatialBN_1.pb, SpatialBN_1_input.pb and SpatialBN_1_output.pb files. + vector X{0.329876f, -0.287158f, -0.411425f, 0.473621f, 0.18156f, -0.170596f, -0.329516f, -0.170733f, -0.121664f, 0.4372f, + -0.485668f, 0.218049f, -0.360263f, 0.107016f, 0.45358f, 0.325056f, 0.15995f, 0.098852f, -0.283453f, -0.373051f, + 0.257542f, 0.0614853f, -0.0592363f, 0.434488f, -0.0179583f, 0.398374f, -0.451602f, -0.132009f, -0.174468f, + -0.0247169f, 0.418897f, -0.47159f, -0.131925f, 0.470943f, 0.118357f, 0.155664f, 0.370062f, -0.279229f, 0.240311f, + -0.451034f, 0.249178f, -0.294496f, 0.13683f, -0.0806475f, -0.309849f, -0.450604f, -0.28048f, -0.420197f, -0.433369f}; + vector scale{0.589433f}; + vector B{-0.384622f}; + vector mean{-2.45673f}; + vector var{1.37998f}; + + InputDataMap input_data_map; + input_data_map.insert({"X", X}); + input_data_map.insert({"scale", scale}); + input_data_map.insert({"B", B}); + input_data_map.insert({"mean", mean}); + input_data_map.insert({"var", var}); + InputShapesMap input_shapes_map; vector input_shape{1, 1, 7, 7, 1}; input_shapes_map.insert({"X", input_shape}); diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc index 1d31f3fdb4eb4..5addb5dd9ce46 100644 --- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc @@ -572,8 +572,8 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_pytorch_half_pixe test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kDmlExecutionProvider}); } -TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_asymmetric) { - // To test NNAPI EP, we need the sclaes/sizes to be in initializers +TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_asymmetric_scales) { + // To test CoreML/NNAPI EP, we need the scales/sizes to be in initializers auto run_test = [](bool scales_in_initializer) { OpTester test("Resize", 13); std::vector roi{}; From acbfc29f272b5578145e7600bc42342e116ffbc2 Mon Sep 17 00:00:00 2001 From: pengwa Date: Fri, 1 Mar 2024 10:57:14 +0800 Subject: [PATCH 092/279] Follow up fix for Gelu impl (#19693) ### Follow up fix for Gelu impl There are two minor comments in https://github.com/microsoft/onnxruntime/pull/19560. Fix them in this pull request. ### Motivation and Context --- docs/ORTModule_Training_Guidelines.md | 2 +- onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc | 8 +++----- onnxruntime/contrib_ops/cuda/bert/fast_gelu.h | 4 +++- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/ORTModule_Training_Guidelines.md b/docs/ORTModule_Training_Guidelines.md index 91057d3dfb120..f50b18b736936 100644 --- a/docs/ORTModule_Training_Guidelines.md +++ b/docs/ORTModule_Training_Guidelines.md @@ -293,7 +293,7 @@ A classical usage of disabling the deep copy: when the deep copy before module e export ORTMODULE_MEMORY_OPT_LEVEL=0 ``` -### ORTMODULE_ENABLE_MEM_EFFICIENT_GRAD_MGMT +#### ORTMODULE_ENABLE_MEM_EFFICIENT_GRAD_MGMT - **Feature Area**: *ORTMODULE/Optimizations* - **Description**: By default, the memory-efficient gradient management is turned off. 
The gradient after it is computed in ONNX Runtime, will trigger the corresponding parameter's backward function through `PythonOpGrad` operator. This would help release the gradient buffer managed in ONNX Runtime, which originally is released once all backward computation finishes. diff --git a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc index e8974a29476b6..8b8e4e267f895 100644 --- a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc +++ b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc @@ -8,8 +8,7 @@ #include "contrib_ops/cpu/bert/bias_gelu_helper.h" #ifdef USE_ROCM #include "contrib_ops/rocm/bert/elementwise.h" -#endif -#ifdef USE_CUDA +#else #include "contrib_ops/cuda/bert/transformer_common.h" #endif @@ -36,7 +35,7 @@ using namespace ONNX_NAMESPACE; template FastGelu::FastGelu(const OpKernelInfo& op_kernel_info) : CudaKernel(op_kernel_info) { -#ifdef USE_CUDA +#ifndef USE_ROCM const TransformerOptions* options = TransformerOptions::GetInstance(); use_half2_ = !options->DisableHalf2(); #endif @@ -63,8 +62,7 @@ Status FastGelu::ComputeInternal(OpKernelContext* context) const { reinterpret_cast(input->Data()), static_cast(input_length), (nullptr != bias) ? reinterpret_cast(bias->Data()) : nullptr, static_cast(bias_length), reinterpret_cast(output->MutableData())); -#endif -#ifdef USE_CUDA +#else return LaunchFastGeluKernel(GetDeviceProp(), Stream(context), static_cast(input_length), diff --git a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h index d563556593e6e..26f3bd5a03928 100644 --- a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h +++ b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h @@ -18,7 +18,9 @@ class FastGelu final : public CudaKernel { Status ComputeInternal(OpKernelContext* ctx) const override; private: - bool use_half2_; // Only applicable to CUDA kernel (not ROCM). +#ifndef USE_ROCM + bool use_half2_; +#endif }; } // namespace cuda From ed550b5fe5aa41e182db84d2b2f2fb768121fd7a Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Thu, 29 Feb 2024 20:36:29 -0800 Subject: [PATCH 093/279] Change webgpu CI pipeline to use a preinstalled chrome (#19729) ### Description Change webgpu CI pipeline to use a preinstalled chrome. Hopefully it can increase the stability. Now the chrome got from puppeteer often failed to start. 
--- .../github/azure-pipelines/templates/win-web-ci.yml | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml index 043da233cc674..b882d6fb167fd 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml @@ -31,6 +31,7 @@ jobs: variables: webgpuCommandlineExtraFlags: '--chromium-flags=--ignore-gpu-blocklist --chromium-flags=--gpu-vendor-id=0x10de' runCodesignValidationInjection: false + CHROME_BIN: 'C:\Program Files\Google\Chrome\Application\chrome.exe' timeoutInMinutes: 60 workspace: clean: all @@ -95,18 +96,6 @@ jobs: targetFolder: $(Build.SourcesDirectory)\js\web\lib\wasm\binding flattenFolders: true displayName: 'Binplace js files' - - script: | - npm i -g puppeteer - workingDirectory: '$(Build.SourcesDirectory)' - displayName: 'Use puppeteer to prepare Chrome for tests' - - script: | - FOR /F "tokens=* USEBACKQ" %%F IN (`where /r %HOMEDRIVE%%HOMEPATH%\.cache\puppeteer chrome.exe`) DO ( - SET var=%%F - ECHO found chrome.exe: %%F - ) - ECHO ##vso[task.setvariable variable=CHROME_BIN;]%var% - workingDirectory: '$(Build.SourcesDirectory)' - displayName: 'Set CHROME_BIN' - script: | npm ci workingDirectory: '$(Build.SourcesDirectory)\js' From 5672cdebdf5648815fcc3a001dc00e610a9f9b51 Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Fri, 1 Mar 2024 11:01:58 -0800 Subject: [PATCH 094/279] Update google benchmark to 1.8.3. (#19734) Update google benchmark to 1.8.3. Update deps_update_and_upload.py script to make it easier to use. --- cgmanifests/generated/cgmanifest.json | 2 +- cmake/deps.txt | 2 +- cmake/deps_update_and_upload.py | 135 ++++++++++++------ .../templates/download-deps.yml | 4 +- 4 files changed, 98 insertions(+), 45 deletions(-) diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index efd901787fdb7..cfad59be6b4c0 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -116,7 +116,7 @@ "component": { "type": "git", "git": { - "commitHash": "361e8d1cfe0c6c36d30b39f1b61302ece5507320", + "commitHash": "344117638c8ff7e239044fd0fa7085839fc03021", "repositoryUrl": "https://github.com/google/benchmark.git" }, "comments": "google_benchmark" diff --git a/cmake/deps.txt b/cmake/deps.txt index cb431f8c77397..9cba25b00157d 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -26,7 +26,7 @@ eigen;https://gitlab.com/libeigen/eigen/-/archive/e7248b26a1ed53fa030c5c459f7ea0 flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v1.12.0.zip;ba0a75fd12dbef8f6557a74e611b7a3d0c5fe7bf fp16;https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip;b985f6985a05a1c03ff1bb71190f66d8f98a1494 fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34395ec3017c8.zip;a5658f4036402dbca7cebee32be57fb8149811e1 -google_benchmark;https://github.com/google/benchmark/archive/refs/tags/v1.7.0.zip;e97c368b176e8614e3f1bf13dd9abcf6a7ad9908 +google_benchmark;https://github.com/google/benchmark/archive/refs/tags/v1.8.3.zip;bf9870756ee3f8d2d3b346b24ee3600a41c74d3d google_nsync;https://github.com/google/nsync/archive/refs/tags/1.26.0.zip;5e7c00ef6bf5b787386fc040067903ec774e2752 
googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034 googlexnnpack;https://github.com/google/XNNPACK/archive/0da379fc4808f9601faef392352018c741c0f297.zip;663883491e380b628e0a5b162b5f2658032fae73 diff --git a/cmake/deps_update_and_upload.py b/cmake/deps_update_and_upload.py index d357284d91225..63df3f6f03869 100644 --- a/cmake/deps_update_and_upload.py +++ b/cmake/deps_update_and_upload.py @@ -1,56 +1,109 @@ -# in case deps.txt is updated, run this file to update and upload the dependencies so that CI can use them. -# Before running the script, increase the version number found at: +# If deps.txt is updated, run this file to update and upload the dependencies so that CI can use them. +# +# Before running the script, find the latest version number at: # https://aiinfra.visualstudio.com/Lotus/_artifacts/feed/Lotus/UPack/onnxruntime_build_dependencies/versions +# Increment it to obtain a new version number to use. +# # Run without --do-upload once to verify downloading. Use --do-upload when you are ready to publish. -# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.82 --do-upload -# update version number in tools\ci_build\github\azure-pipelines\templates\download-deps.yml +# E.g.: +# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.82 +# # check contents of C:/temp/onnxruntime_deps +# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.82 --no-download --do-upload +# +# Next, update the version number in tools/ci_build/github/azure-pipelines/templates/download-deps.yml. + +import argparse +import contextlib +import pathlib import re import subprocess -import os -import argparse import tempfile +script_dir = pathlib.Path(__file__).parent + parser = argparse.ArgumentParser(description="Update dependencies and publish to Azure Artifacts") parser.add_argument( - "--root-path", type=str, default=tempfile.gettempdir(), help="Target root path for downloaded files" + "--root-path", + type=pathlib.Path, + help="Target root path for downloaded files. If not provided, a temporary directory is used.", +) +parser.add_argument( + "--version", + type=str, + help="Package version to publish", +) +parser.add_argument( + "--do-upload", + action="store_true", + dest="upload", + help="Upload the package to Azure Artifacts", +) +parser.add_argument( + "--no-download", + action="store_false", + dest="download", + help="Skip downloading the dependency files. " + "Use with '--do-upload' and '--root-path' to upload the package from existing dependency files.", ) -parser.add_argument("--version", type=str, default="1.0.82", help="Package version to publish") -parser.add_argument("--do-upload", action="store_true", help="Upload the package to Azure Artifacts") args = parser.parse_args() -with open("cmake/deps.txt") as file: +if args.upload: + assert args.version is not None, "'--version' must be specified if uploading." + +if args.upload != args.download: + assert args.root_path is not None, "'--root-path' must be specified if only downloading or uploading." 
+ +deps_path = script_dir / "deps.txt" +with open(deps_path) as file: text = file.read() lines = [line for line in text.split("\n") if not line.startswith("#") and ";" in line] -root_path = args.root_path - -for line in lines: - url = re.sub("^[^;]+?;https://([^;]+?);.*", r"https://\1", line) - filename = re.sub("^[^;]+?;https://([^;]+?);.*", r"\1", line) - full_path = os.path.join(root_path, filename) - subprocess.run(["curl", "-sSL", "--create-dirs", "-o", full_path, url]) # noqa: PLW1510 - -package_name = "onnxruntime_build_dependencies" -version = args.version - -# Check if the user is logged in to Azure -result = subprocess.run("az account show", shell=True, capture_output=True, text=True) # noqa: PLW1510 -if "No subscriptions found" in result.stderr: - # Prompt the user to log in to Azure - print("You are not logged in to Azure. Please log in to continue.") - subprocess.run("az login", shell=True) # noqa: PLW1510 - -# Publish the package to Azure Artifacts if --no-upload is not specified - -cmd = f'az artifacts universal publish --organization https://dev.azure.com/onnxruntime --feed onnxruntime --name {package_name} --version {version} --description "onnxruntime build time dependencies" --path {root_path}' -if args.do_upload: - subprocess.run(cmd, shell=True) # noqa: PLW1510 -else: - print("would have run: " + cmd) - -cmd = f'az artifacts universal publish --organization https://dev.azure.com/aiinfra --feed Lotus --name {package_name} --version {version} --description "onnxruntime build time dependencies" --path {root_path}' -if args.do_upload: - subprocess.run(cmd, shell=True) # noqa: PLW1510 -else: - print("would have run: " + cmd) +with contextlib.ExitStack() as context_stack: + if args.root_path is not None: + root_path = args.root_path.resolve() + root_path.mkdir(parents=True, exist_ok=True) + else: + temp_dir_name = context_stack.enter_context(tempfile.TemporaryDirectory()) + root_path = pathlib.Path(temp_dir_name) + + if args.download: + print(f"Downloading dependencies to directory: {root_path}") + + dep_pattern = re.compile(r"^[^;]+;https://([^;]+);.*$") + + for line in lines: + match = dep_pattern.fullmatch(line) + if match is None: + continue + + dep_path = match[1] + url = f"https://{dep_path}" + full_path = root_path / dep_path + + subprocess.run(["curl", "-sSL", "--create-dirs", "-o", str(full_path), url], check=True) + + package_name = "onnxruntime_build_dependencies" + version = args.version if args.version is not None else "VERSION_PLACEHOLDER" + + if args.upload: + # Check if the user is logged in to Azure + result = subprocess.run("az account show", shell=True, capture_output=True, text=True, check=False) + if "No subscriptions found" in result.stderr: + # Prompt the user to log in to Azure + print("You are not logged in to Azure. 
Please log in to continue.") + subprocess.run("az login", shell=True, check=True) + + # Publish the package to Azure Artifacts if --do-upload is specified + + cmd = f'az artifacts universal publish --organization https://dev.azure.com/onnxruntime --feed onnxruntime --name {package_name} --version {version} --description "onnxruntime build time dependencies" --path {root_path}' + if args.upload: + subprocess.run(cmd, shell=True, check=True) + else: + print("would have run: " + cmd) + + cmd = f'az artifacts universal publish --organization https://dev.azure.com/aiinfra --feed Lotus --name {package_name} --version {version} --description "onnxruntime build time dependencies" --path {root_path}' + if args.upload: + subprocess.run(cmd, shell=True, check=True) + else: + print("would have run: " + cmd) diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index 95e34cd863915..01be343795a56 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.133 + version: 1.0.134 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.133 + version: 1.0.134 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. From 22176a5fa8fe97efe05a63c1e7bb89b0e54cd201 Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Fri, 1 Mar 2024 13:44:29 -0800 Subject: [PATCH 095/279] disable gemm f16 on CPU (#19744) ### Description Temporarily disable fp16 gemm on CPU because it usually needs a following Cast which offsets the gain. Need more fp16 operators implementation and performance tuning. Also fix a fusion error of LayerNormalization. ### Motivation and Context --- .vscode/settings.json | 5 ++++- .../core/optimizer/layer_norm_fusion.cc | 14 +++++++++++++ .../providers/cpu/cpu_execution_provider.cc | 21 ------------------- .../test/providers/cpu/math/gemm_test.cc | 2 +- 4 files changed, 19 insertions(+), 23 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 3e2b1f31dd6cf..98d23090fd474 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -21,5 +21,8 @@ "cpplint.filters": [ "-build/include_subdir", "-runtime/references" - ] + ], + "files.associations": { + "span": "cpp" + } } diff --git a/onnxruntime/core/optimizer/layer_norm_fusion.cc b/onnxruntime/core/optimizer/layer_norm_fusion.cc index b6ad4fde6c1f7..ce696154adb6d 100644 --- a/onnxruntime/core/optimizer/layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/layer_norm_fusion.cc @@ -447,6 +447,13 @@ Status LayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, NodeArg* x_input = has_leading_cast ? 
graph.GetNode(p_reduce_mean_input_node->Index())->MutableInputDefs()[0] : reduce_mean_node.MutableInputDefs()[0]; + + // CPU doesn't support fp16 + if (reduce_mean_node.GetExecutionProviderType() == kCpuExecutionProvider && + x_input->TypeAsProto()->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) { + continue; + } + InlinedVector layer_norm_input_defs{x_input, scale, bias}; Node& layer_norm_node = graph.AddNode(graph.GenerateNodeName("LayerNormalization"), "LayerNormalization", @@ -689,6 +696,13 @@ Status SimplifiedLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int gr NodeArg* x_input = has_leading_cast ? graph.GetNode(p_pow_input_node->Index())->MutableInputDefs()[0] : pow_node.MutableInputDefs()[0]; + + // CPU doesn't support fp16 + if (reduce_mean_node.GetExecutionProviderType() == kCpuExecutionProvider && + x_input->TypeAsProto()->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) { + continue; + } + InlinedVector layer_norm_input_defs{x_input, scale}; Node& layer_norm_node = graph.AddNode(graph.GenerateNodeName("SimplifiedLayerNormalization"), "SimplifiedLayerNormalization", diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc index 37e7e42150413..7e0f919deb0a7 100644 --- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc +++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc @@ -143,9 +143,6 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, Aco class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, Atan); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, 8, float, Gemm); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, 8, double, Gemm); -#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, 8, MLFloat16, Gemm); -#endif class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 10, Hardmax); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 10, float, LogSoftmax); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 10, double, LogSoftmax); @@ -335,9 +332,6 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOn class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, 10, Flatten); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, 10, float, Gemm); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, 10, double, Gemm); -#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, 10, MLFloat16, Gemm); -#endif class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, 12, float, MatMul); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, 12, double, MatMul); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, 12, int32_t, MatMul); @@ -497,9 +491,6 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, Sp class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 12, ScatterND); class 
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 12, float, Gemm); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 12, double, Gemm); -#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 12, MLFloat16, Gemm); -#endif class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 12, GatherElements); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, uint8_t, BitShift); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, uint32_t, BitShift); @@ -606,9 +597,6 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, string, Expand); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, float, Gemm); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, double, Gemm); -#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, MLFloat16, Gemm); -#endif class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, float, MatMul); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, double, MatMul); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, int32_t, MatMul); @@ -2617,15 +2605,6 @@ Status RegisterFp16Kernels(KernelRegistry& kernel_registry) { MLFloat16, LeakyRelu)>, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/test/providers/cpu/math/gemm_test.cc b/onnxruntime/test/providers/cpu/math/gemm_test.cc index 428925e154497..1a542fb67418e 100644 --- a/onnxruntime/test/providers/cpu/math/gemm_test.cc +++ b/onnxruntime/test/providers/cpu/math/gemm_test.cc @@ -277,7 +277,7 @@ class GemmOpTypedTests : public ::testing::Test { // On CPUs without fp16 instructions the tests will output a warning: // "registered execution providers CPUExecutionProvider were unable to run the model" // , then they will still pass. -using GemmOpTypedTestsTypes = ::testing::Types; +using GemmOpTypedTestsTypes = ::testing::Types; TYPED_TEST_SUITE(GemmOpTypedTests, GemmOpTypedTestsTypes); TYPED_TEST(GemmOpTypedTests, TestGemmScalarBroadcast) { From f06164ef8b8de42dd67ca2137f6996cdc87a3f72 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 1 Mar 2024 14:50:06 -0800 Subject: [PATCH 096/279] [js/web] transfer input buffer back to caller thread (#19677) ### Description When using proxy worker, input buffers should be transferred back to the caller thread after `run()` call is done. 
Fixes #19488 --- js/web/lib/wasm/proxy-worker/main.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/web/lib/wasm/proxy-worker/main.ts b/js/web/lib/wasm/proxy-worker/main.ts index 6cbd38c76ccc8..3ce37a2d6b652 100644 --- a/js/web/lib/wasm/proxy-worker/main.ts +++ b/js/web/lib/wasm/proxy-worker/main.ts @@ -103,7 +103,7 @@ self.onmessage = (ev: MessageEvent): void => { } else { postMessage( {type, out: outputs} as OrtWasmMessage, - extractTransferableBuffers(outputs as SerializableTensorMetadata[])); + extractTransferableBuffers([...inputs, ...outputs] as SerializableTensorMetadata[])); } }, err => { From a0521f899e9d495d57ae044bd4a1fe4d17155782 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Fri, 1 Mar 2024 16:23:20 -0800 Subject: [PATCH 097/279] Enable CPUINFO for all Windows build (#19655) ### Description It was disabled in PR #9065. And the reason was: " api-ms-win-core-kernel32-legacy-*.dll wasn't available in Windows 8 and was added in Windows 10, so cpuinfo breaks our Windows 8 support. I'm disabling it again." We no longer support Windows 8. Therefore we can add CPUINFO back. ### Motivation and Context To make the code simpler. If in any case the library doesn't work as expected, we can submit a PR to their code base and fix it. --- .../external/onnxruntime_external_deps.cmake | 9 +- cmake/onnxruntime_common.cmake | 5 -- onnxruntime/core/common/cpuid_info.cc | 82 ++++++++----------- onnxruntime/core/common/cpuid_info.h | 19 ++--- 4 files changed, 42 insertions(+), 73 deletions(-) diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 09d57164b4ee1..cb75b0b8751bb 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -256,14 +256,7 @@ if (onnxruntime_ENABLE_CPUINFO) set(CPUINFO_SUPPORTED TRUE) endif() if (WIN32) - # Exclude Windows ARM build and Windows Store - if (${onnxruntime_target_platform} MATCHES "^(ARM.*|arm.*)$" ) - message(WARNING "Cpuinfo not included for compilation problems with Windows ARM.") - set(CPUINFO_SUPPORTED FALSE) - elseif (WIN32 AND NOT CMAKE_CXX_STANDARD_LIBRARIES MATCHES kernel32.lib) - message(WARNING "Cpuinfo not included non-Desktop builds") - set(CPUINFO_SUPPORTED FALSE) - endif() + set(CPUINFO_SUPPORTED TRUE) elseif (NOT ${onnxruntime_target_platform} MATCHES "^(i[3-6]86|AMD64|x86(_64)?|armv[5-8].*|aarch64|arm64)$") message(WARNING "Target processor architecture \"${onnxruntime_target_platform}\" is not supported in cpuinfo. " diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake index 6b8c2560b1714..fb56e3f3445d4 100644 --- a/cmake/onnxruntime_common.cmake +++ b/cmake/onnxruntime_common.cmake @@ -201,10 +201,6 @@ endif() if (RISCV64 OR ARM64 OR ARM OR X86 OR X64 OR X86_64) - if((WIN32 AND NOT CMAKE_CXX_STANDARD_LIBRARIES MATCHES kernel32.lib) OR ((ARM64 OR ARM) AND MSVC)) - # msvc compiler report syntax error with cpuinfo arm source files - # and cpuinfo does not have code for getting arm uarch info under windows - else() # Link cpuinfo if supported # Using it mainly in ARM with Android. # Its functionality in detecting x86 cpu features are lacking, so is support for Windows. 
@@ -212,7 +208,6 @@ if (RISCV64 OR ARM64 OR ARM OR X86 OR X64 OR X86_64) onnxruntime_add_include_to_target(onnxruntime_common cpuinfo::cpuinfo) list(APPEND onnxruntime_EXTERNAL_LIBRARIES cpuinfo::cpuinfo ${ONNXRUNTIME_CLOG_TARGET_NAME}) endif() - endif() endif() if (NOT onnxruntime_BUILD_SHARED_LIB) diff --git a/onnxruntime/core/common/cpuid_info.cc b/onnxruntime/core/common/cpuid_info.cc index 711fd595e90fd..be881f6bc4bc2 100644 --- a/onnxruntime/core/common/cpuid_info.cc +++ b/onnxruntime/core/common/cpuid_info.cc @@ -52,6 +52,13 @@ #if defined(CPUINFO_SUPPORTED) #include +#if defined(CPUIDINFO_ARCH_ARM) +namespace onnxruntime { +// The following function is declared in "core/common/cpuid_uarch.h" but we cannot include the whole header file because +// some of its symbols are conflict with +void decodeMIDR(uint32_t midr, uint32_t uarch[1]); +} // namespace onnxruntime +#endif #else #include "core/common/cpuid_uarch.h" #endif // CPUINFO_SUPPORTED @@ -142,11 +149,6 @@ void CPUIDInfo::ArmLinuxInit() { // Pytorch CPUINFO only works on ARM linux or android // Assuming no hyper-threading, no NUMA groups #ifdef CPUINFO_SUPPORTED - pytorch_cpuinfo_init_ = cpuinfo_initialize(); - if (!pytorch_cpuinfo_init_) { - LOGS_DEFAULT(WARNING) << "Failed to init pytorch cpuinfo library, may cause CPU EP performance degradation due to undetected CPU features."; - return; - } is_hybrid_ = cpuinfo_get_uarchs_count() > 1; has_arm_neon_dot_ = cpuinfo_has_arm_neon_dot(); has_fp16_ = cpuinfo_has_arm_neon_fp16_arith(); @@ -239,52 +241,24 @@ void CPUIDInfo::ArmWindowsInit() { lastUarch = uarch; } } - - switch (lastUarch) { - case cpuinfo_uarch_cortex_a55: - case cpuinfo_uarch_cortex_a55r0: - case cpuinfo_uarch_cortex_a76: - case cpuinfo_uarch_neoverse_n1: - case cpuinfo_uarch_cortex_a77: - case cpuinfo_uarch_exynos_m4: - case cpuinfo_uarch_exynos_m5: - has_fp16_ = true; - break; - default: - break; - } - if (!has_fp16_) { - /* - * Detecting fp16 support. Different cores should have the same instruction set. 
- * So we just check the first ID_AA64PFR0_EL1 - * Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0100), Op2(0b000), - */ - uint64_t ID_AA64PFR0_EL1; - unsigned long valsize = sizeof(uint64_t); - auto retCode = ::RegGetValueA( - HKEY_LOCAL_MACHINE, - "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", - "CP 4020", RRF_RT_REG_QWORD, nullptr, - &ID_AA64PFR0_EL1, &valsize); - if (retCode == ERROR_SUCCESS) { - // AdvSIMD, bits [23:20] - auto advSimd = ID_AA64PFR0_EL1 >> 20; - if ((advSimd & 0xfULL) == 1) { - has_fp16_ = true; - } - } - } #endif /* Application Family or OneCore Family */ has_arm_neon_dot_ = (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) != 0); #else has_arm_neon_dot_ = false; #endif - has_fp16_ |= has_arm_neon_dot_; - /* TODO: implement them when hw+sw is available for testing these features */ - has_arm_neon_i8mm_ = false; - has_arm_sve_i8mm_ = false; - has_arm_neon_bf16_ = false; + + if (pytorch_cpuinfo_init_) { + has_fp16_ = cpuinfo_has_arm_neon_fp16_arith(); + has_arm_neon_i8mm_ = cpuinfo_has_arm_i8mm(); + has_arm_sve_i8mm_ = cpuinfo_has_arm_sve() && cpuinfo_has_arm_i8mm(); + has_arm_neon_bf16_ = cpuinfo_has_arm_neon_bf16(); + } else { + has_fp16_ = false; + has_arm_neon_i8mm_ = false; + has_arm_sve_i8mm_ = false; + has_arm_neon_bf16_ = false; + } } #endif /* (arm or arm64) and windows */ @@ -304,5 +278,21 @@ uint32_t CPUIDInfo::GetCurrentCoreIdx() const { return 0xFFFFFFFF; // don't know how to get core index #endif } - +CPUIDInfo::CPUIDInfo() { +#ifdef CPUIDINFO_ARCH_X86 + X86Init(); +#elif defined(CPUIDINFO_ARCH_ARM) +#if CPUINFO_SUPPORTED + pytorch_cpuinfo_init_ = cpuinfo_initialize(); + if (!pytorch_cpuinfo_init_) { + LOGS_DEFAULT(WARNING) << "Failed to init pytorch cpuinfo library, may cause CPU EP performance degradation due to undetected CPU features."; + } +#endif +#ifdef __linux__ + ArmLinuxInit(); +#elif defined(_WIN32) + ArmWindowsInit(); +#endif /* (arm or arm64) and windows */ +#endif +} } // namespace onnxruntime diff --git a/onnxruntime/core/common/cpuid_info.h b/onnxruntime/core/common/cpuid_info.h index 2f8041e39f680..a3936b4bd11a6 100644 --- a/onnxruntime/core/common/cpuid_info.h +++ b/onnxruntime/core/common/cpuid_info.h @@ -93,17 +93,7 @@ class CPUIDInfo { } private: - CPUIDInfo() { -#ifdef CPUIDINFO_ARCH_X86 - X86Init(); -#elif defined(CPUIDINFO_ARCH_ARM) -#ifdef __linux__ - ArmLinuxInit(); -#elif defined(_WIN32) - ArmWindowsInit(); -#endif /* (arm or arm64) and windows */ -#endif - } + CPUIDInfo(); bool has_amx_bf16_{false}; bool has_avx_{false}; bool has_avx2_{false}; @@ -131,11 +121,13 @@ class CPUIDInfo { #ifdef CPUIDINFO_ARCH_X86 void X86Init(); - #elif defined(CPUIDINFO_ARCH_ARM) + // Now the following var is only used in ARM build, but later one we may expand the usage. + bool pytorch_cpuinfo_init_{false}; +#endif + #ifdef __linux__ - bool pytorch_cpuinfo_init_{false}; void ArmLinuxInit(); #elif defined(_WIN32) @@ -143,7 +135,6 @@ class CPUIDInfo { void ArmWindowsInit(); #endif /* (arm or arm64) and windows */ -#endif }; } // namespace onnxruntime From de3158e78d09992e4b5085c15da44108d9c6fa83 Mon Sep 17 00:00:00 2001 From: zesongw Date: Sat, 2 Mar 2024 08:55:50 +0800 Subject: [PATCH 098/279] [WebNN EP] Add contraints for MatMul (#19713) ### Description Add constraints to MatMul: - The input must be at least 2D. - CPU backend: The input rank must be the same. - CPU backend: The input shape except for the last two axis must be the same. ### Motivation and Context Prevent regression for some models. 
--- .../webnn/builders/impl/gemm_op_builder.cc | 73 +++++++++++-------- 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc index d5f84f853f7de..455e0e5f16a42 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc @@ -91,44 +91,33 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N bool GemmOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType /* device_type */, + const WebnnDeviceType device_type, const logging::Logger& logger) const { (void)initializers; const auto& op_type = node.OpType(); const auto& input_defs(node.InputDefs()); const size_t a_idx = 0, b_idx = 1, c_idx = 2; // A*B+C - if (op_type == "Gemm") { - std::vector a_shape; - { - if (!GetShape(*input_defs[a_idx], a_shape, logger)) - return false; - - if (a_shape.size() != 2) { - LOGS(logger, VERBOSE) << "A must be 2D"; - return false; - } - - if (Product(a_shape) == 0) { - LOGS(logger, VERBOSE) << "A must be non-empty"; - return false; - } - } - - std::vector b_shape; - { - if (!GetShape(*input_defs[b_idx], b_shape, logger)) - return false; + std::vector a_shape; + if (!GetShape(*input_defs[a_idx], a_shape, logger)) + return false; + if (Product(a_shape) == 0) { + LOGS(logger, VERBOSE) << "A must be non-empty"; + return false; + } - if (b_shape.size() != 2) { - LOGS(logger, VERBOSE) << "B must be 2D"; - return false; - } + std::vector b_shape; + if (!GetShape(*input_defs[b_idx], b_shape, logger)) + return false; + if (Product(b_shape) == 0) { + LOGS(logger, VERBOSE) << "B must be non-empty"; + return false; + } - if (Product(b_shape) == 0) { - LOGS(logger, VERBOSE) << "B must be non-empty"; - return false; - } + if (op_type == "Gemm") { + if (a_shape.size() != 2 || b_shape.size() != 2) { + LOGS(logger, VERBOSE) << "A and B must be 2D for Gemm"; + return false; } // C of Gemm. @@ -162,6 +151,30 @@ bool GemmOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, } } + if (op_type == "MatMul") { + if (a_shape.size() < 2 || b_shape.size() < 2) { + LOGS(logger, VERBOSE) << "Inputs of MatMul must be at least 2D"; + return false; + } + + // WebNN CPU backend has two more constraints. + // https://source.chromium.org/chromium/chromium/src/+/main:third_party/blink/renderer/modules/ml/webnn/ml_graph_xnnpack.cc;l=1177 + // TODO: Remove this workaround when Chromium enables broadcast for MatMul on WebNN CPU backend. + if (device_type == WebnnDeviceType::CPU) { + if (a_shape.size() != b_shape.size()) { + LOGS(logger, VERBOSE) << "The rank of two inputs for WebNN CPU backend MatMul must be the same."; + return false; + } + + for (size_t i = 0; i < a_shape.size() - 2; i++) { + if (a_shape[i] != b_shape[i]) { + LOGS(logger, VERBOSE) << "WebNN CPU backend can't support broadcasting for MatMul."; + return false; + } + } + } + } + return true; } From 2d79052ec38b831f3254b20e0f6a42b3f98eabc7 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Fri, 1 Mar 2024 18:39:51 -0800 Subject: [PATCH 099/279] [QNN Quant] Add preprocessing option to transpose graph inputs/outputs to channel-last (#19731) ### Description Adds the optional parameters `inputs_to_make_channel_last` and `outputs_to_make_channel_last` to the `qnn_preprocess_model()` function. 
```python """ inputs_to_make_channel_last: List of graph input names to transpose to be "channel-last". For example, if "input0" originally has the shape (N, C, D1, D2, ..., Dn), the resulting model will change input0's shape to (N, D1, D2, ..., Dn, C) and add a transpose node after it. Original: input0 (N, C, D1, D2, ..., Dn) --> Updated: input0 (N, D1, D2, ..., Dn, C) --> Transpose --> input0_chanfirst (N, C, D1, D2, ..., Dn) --> This can potentially improve inference latency for QDQ models running on QNN EP because the additional transpose node may allow other transpose nodes inserted during ORT layout transformation to cancel out. outputs_to_make_channel_last: List of graph output names to transpose to be "channel-last". For example, if "output0" originally has the shape (N, C, D1, D2, ..., Dn), the resulting model will change output0's shape to (N, D1, D2, ..., Dn, C) and add a transpose node before it. Original: --> output0 (N, C, D1, D2, ..., Dn) Updated: --> output0_chanfirst (N, C, D1, D2, ..., Dn) --> Transpose --> output0 (N, D1, D2, ..., Dn, C) This can potentially improve inference latency for QDQ models running on QNN EP because the additional transpose node may allow other transpose nodes inserted during ORT layout transformation to cancel out. """ ``` **NOTE: If you use these options with the quantization scripts, you'll have to make sure your data_reader feeds in transposed input data. It won't happen automatically.** ### Motivation and Context Native QNN operators use the channel-last data layout, but ONNX uses channel-first. To bridge the gap, ORT's layout transformer inserts transposes around layout-sensitive nodes and updates their domain to indicate that they now operate on channel-last data. The transpose optimizer is able to remove most of these inserted transposes, but not all transposes can always be removed (i.e., some could remain at the graph's inputs and outputs). We've found that these extra transpose nodes can significantly degrade inference latency on QNN EP. One workaround (provided by this PR) is to add _additional_ transpose nodes at the graph inputs or outputs. These additional nodes can often help the ORT transpose optimizer cancel out any remaining transpose nodes, which significantly improves latency. Additionally, it may make more sense for some kinds of inputs to just be in channel-last form (e.g., images), avoiding the need to pre-transpose of the input data before inference. 
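As a usage sketch (the model paths and tensor names here are hypothetical; only the parameter names come from this PR), the new options are passed straight to `qnn_preprocess_model()`. The shape examples that follow show what the transform does to each selected input and output:

```python
from onnxruntime.quantization.execution_providers.qnn import qnn_preprocess_model

# Hypothetical model paths and tensor names, for illustration only.
modified = qnn_preprocess_model(
    "model.onnx",
    "model.qnn_pp.onnx",
    inputs_to_make_channel_last=["pixel_values"],   # assumed NCHW image input
    outputs_to_make_channel_last=["feature_map"],   # assumed NCHW-like output
)

# Note: the calling code (and any quantization data reader) must now feed
# "pixel_values" in channel-last (NHWC) layout; the data itself is not transposed for you.
```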
Example at the input: ``` Original: input0 (N, C, D1, D2, ..., Dn) --> Updated: input0 (N, D1, D2, ..., Dn, C) --> Transpose --> input0_chanfirst (N, C, D1, D2, ..., Dn) --> ``` Example at the output: ``` Original: --> output0 (N, C, D1, D2, ..., Dn) Updated: --> output0_chanfirst (N, C, D1, D2, ..., Dn) --> Transpose --> output0 (N, D1, D2, ..., Dn, C) ``` --- .../execution_providers/qnn/preprocess.py | 198 ++++++++++++++++++ .../quantization/test_qnn_preprocess_model.py | 93 ++++++++ 2 files changed, 291 insertions(+) diff --git a/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py b/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py index b0dab81830c8b..e584a65574520 100644 --- a/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py +++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py @@ -24,6 +24,8 @@ def qnn_preprocess_model( external_data_location: str | None = None, external_data_size_threshold: int = 1024, external_data_convert_attribute: bool = False, + inputs_to_make_channel_last: list[str] | None = None, + outputs_to_make_channel_last: list[str] | None = None, ) -> bool: """ If necessary, this method creates a new "pre-processed" model in preparation for @@ -52,6 +54,32 @@ def qnn_preprocess_model( external_data_convert_attribute: Effective only if save_as_external_data is true. Defaults to false. If true, convert all tensors to external data. If false, convert only non-attribute tensors to external data. + inputs_to_make_channel_last: List of graph input names to transpose to be "channel-last". For example, + if "input0" originally has the shape (N, C, D1, D2, ..., Dn), the resulting model will change input0's + shape to (N, D1, D2, ..., Dn, C) and add a transpose node after it. + + Original: + input0 (N, C, D1, D2, ..., Dn) --> + + Updated: + input0 (N, D1, D2, ..., Dn, C) --> Transpose --> input0_chanfirst (N, C, D1, D2, ..., Dn) --> + + This can potentially improve inference latency for QDQ models running on QNN EP because the + additional transpose node may allow other transpose nodes inserted during ORT layout transformation + to cancel out. + outputs_to_make_channel_last: List of graph output names to transpose to be "channel-last". For example, + if "output0" originally has the shape (N, C, D1, D2, ..., Dn), the resulting model will change output0's + shape to (N, D1, D2, ..., Dn, C) and add a transpose node before it. + + Original: + --> output0 (N, C, D1, D2, ..., Dn) + + Updated: + --> output0_chanfirst (N, C, D1, D2, ..., Dn) --> Transpose --> output0 (N, D1, D2, ..., Dn, C) + + This can potentially improve inference latency for QDQ models running on QNN EP because the + additional transpose node may allow other transpose nodes inserted during ORT layout transformation + to cancel out. """ modified = False model = onnx.load_model(model_input) @@ -83,6 +111,19 @@ def qnn_preprocess_model( if fusion_layernorm.apply(): modified = True + # Optionally, transpose inputs and/or outputs to make them "channel-last". 
+ if inputs_to_make_channel_last or outputs_to_make_channel_last: + transpose_node_prefix = "Transpose_channel_" + transpose_node_suffix: int = onnx_model.get_largest_node_name_suffix(transpose_node_prefix) + 1 + update_io_to_channel_last( + onnx_model.model, + inputs_to_make_channel_last, + outputs_to_make_channel_last, + transpose_node_name_prefix=transpose_node_prefix, + transpose_node_name_start_suffix=transpose_node_suffix, + ) + modified = True + # Make sure all nodes have a name. unnamed_node_prefix = "qnn_preproc_node_" available_suffix = onnx_model.get_largest_node_name_suffix(unnamed_node_prefix) + 1 @@ -107,3 +148,160 @@ def qnn_preprocess_model( ) return modified + + +class InputOutputNameMap: + def __init__( + self, + orig_tensor_names: set[str], + orig_graph_inputs: dict[str, onnx.ValueInfoProto], + orig_graph_outputs: dict[str, onnx.ValueInfoProto], + ): + self.orig_tensor_names = orig_tensor_names + self.orig_graph_inputs = orig_graph_inputs + self.orig_graph_outputs = orig_graph_outputs + self.updated_io_names = {} + self.new_value_infos = [] + + def get_new_name(self, orig_name: str): + if orig_name in self.updated_io_names: + return self.updated_io_names[orig_name] + + # Make a new tensor name that is unique among all tensors in the graph. + prefix: str = f"{orig_name}_channel_first_" + suffix: int = -1 + for tensor_name in self.orig_tensor_names: + if tensor_name.startswith(prefix) and tensor_name[len(prefix) :].isdigit(): + index = int(tensor_name[len(prefix) :]) + suffix = max(suffix, index) + + suffix += 1 # This is the first available suffix. + new_name = f"{prefix}{suffix!s}" + + # Add new value_info objects for these new tensors. + orig_value_info = self.orig_graph_inputs.get(orig_name) or self.orig_graph_outputs[orig_name] + value_info_proto = onnx.ValueInfoProto() + value_info_proto.CopyFrom(orig_value_info) + value_info_proto.name = new_name + self.new_value_infos.append(value_info_proto) + + self.updated_io_names[orig_name] = new_name + return self.updated_io_names[orig_name] + + +def update_io_to_channel_last( + model: onnx.ModelProto, + inputs_to_update: list[str] | None, + outputs_to_update: list[str] | None, + transpose_node_name_prefix: str = "Transpose_channel_", + transpose_node_name_start_suffix: int = 0, +): + inputs_to_update = set(inputs_to_update or []) + outputs_to_update = set(outputs_to_update or []) + + if not inputs_to_update and not outputs_to_update: + return + + graph = model.graph + orig_graph_inputs = {ginput.name: ginput for ginput in graph.input} + orig_graph_outputs = {goutput.name: goutput for goutput in graph.output} + + # Check that the user passed in actual input and output names. + for input_name in inputs_to_update: + if input_name not in orig_graph_inputs: + raise ValueError(f"{input_name} is not a graph input") + + for output_name in outputs_to_update: + if output_name not in orig_graph_outputs: + raise ValueError(f"{output_name} is not a graph output") + + orig_tensor_names = set() + orig_tensor_names.update(set(orig_graph_inputs)) + orig_tensor_names.update(set(orig_graph_outputs)) + orig_tensor_names.update(input_name for node in graph.node for input_name in node.input if input_name) + + # Maps original input (or output) name to its updated name used within the graph. + io_map = InputOutputNameMap(orig_tensor_names, orig_graph_inputs, orig_graph_outputs) + + # Update each node's inputs/outputs to use the transposed versions. 
+ for node in graph.node: + for i in range(len(node.input)): + if node.input[i] and node.input[i] in inputs_to_update: + node.input[i] = io_map.get_new_name(node.input[i]) + elif node.input[i] and node.input[i] in outputs_to_update: + node.input[i] = io_map.get_new_name(node.input[i]) + + for i in range(len(node.output)): + if node.output[i] in outputs_to_update: + node.output[i] = io_map.get_new_name(node.output[i]) + + # Update graph inputs to channel-last and a Transpose (to channel-first) after each. + for g_input_name in inputs_to_update: + g_input = orig_graph_inputs[g_input_name] + + if not g_input.type.HasField("tensor_type") or not g_input.type.tensor_type.HasField("shape"): + raise ValueError(f"Expected input {g_input.name} to have a tensor_type with a shape") + + input_shape = g_input.type.tensor_type.shape + input_rank = len(input_shape.dim) + + if input_rank < 3: + raise ValueError(f"Expected input {g_input.name} to be of rank >= 3") + + channel_dim = onnx.TensorShapeProto.Dimension() + channel_dim.CopyFrom(input_shape.dim[1]) + for i in range(1, input_rank - 1): + input_shape.dim[i].CopyFrom(input_shape.dim[i + 1]) + input_shape.dim[input_rank - 1].CopyFrom(channel_dim) + + transpose_perm = list(range(input_rank)) + for i in range(input_rank): + transpose_perm[i] = i if i < 1 else i - 1 + transpose_perm[1] = input_rank - 1 + + transpose_node = onnx.helper.make_node( + "Transpose", + name=f"{transpose_node_name_prefix}{transpose_node_name_start_suffix!s}", + inputs=[g_input.name], + outputs=[io_map.get_new_name(g_input.name)], + perm=transpose_perm, + ) + transpose_node_name_start_suffix += 1 + + graph.node.extend([transpose_node]) + + # Update graph outputs to channel-last and a Transpose (from channel-first) before each. + for g_output_name in outputs_to_update: + g_output = orig_graph_outputs[g_output_name] + if not g_output.type.HasField("tensor_type") or not g_output.type.tensor_type.HasField("shape"): + raise ValueError(f"Expected output {g_output.name} to have a tensor_type with a shape") + + output_shape = g_output.type.tensor_type.shape + output_rank = len(output_shape.dim) + + if output_rank < 3: + raise ValueError(f"Expected output {g_output.name} to be of rank >= 3") + + channel_dim = onnx.TensorShapeProto.Dimension() + channel_dim.CopyFrom(output_shape.dim[1]) + for i in range(1, output_rank - 1): + output_shape.dim[i].CopyFrom(output_shape.dim[i + 1]) + output_shape.dim[output_rank - 1].CopyFrom(channel_dim) + + transpose_perm = list(range(output_rank)) + for i in range(output_rank): + transpose_perm[i] = i if i == 0 else i + 1 + transpose_perm[output_rank - 1] = 1 + + transpose_node = onnx.helper.make_node( + "Transpose", + name=f"{transpose_node_name_prefix}{transpose_node_name_start_suffix!s}", + inputs=[io_map.get_new_name(g_output.name)], + outputs=[g_output.name], + perm=transpose_perm, + ) + transpose_node_name_start_suffix += 1 + + graph.node.extend([transpose_node]) + + graph.value_info.extend(io_map.new_value_infos) diff --git a/onnxruntime/test/python/quantization/test_qnn_preprocess_model.py b/onnxruntime/test/python/quantization/test_qnn_preprocess_model.py index 9b67fd41caac3..6503b3223b828 100644 --- a/onnxruntime/test/python/quantization/test_qnn_preprocess_model.py +++ b/onnxruntime/test/python/quantization/test_qnn_preprocess_model.py @@ -12,6 +12,7 @@ import numpy as np import onnx +import onnxruntime from onnxruntime.quantization.execution_providers.qnn import qnn_preprocess_model from onnxruntime.quantization.quant_utils import 
model_has_external_data, ms_domain @@ -165,6 +166,98 @@ def test_external_data(self): for node in fused_model.graph.node: self.assertIn(node.op_type, expected_op_types) + def build_multi_input_output_model(self, shape): + """ + Returns the following model. + +----------> [X] + | + [A] ---> Add ---> Abs -+-> Mul ---> [Y] + ^ ^ + | | + [B] ------+-----------------+ + """ + input_a = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, shape) + input_b = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, shape) + output_x = onnx.helper.make_tensor_value_info("X", onnx.TensorProto.FLOAT, shape) + output_y = onnx.helper.make_tensor_value_info("Y", onnx.TensorProto.FLOAT, shape) + + add_node = onnx.helper.make_node("Add", ["A", "B"], ["add_out"], name="add_node") + abs_node = onnx.helper.make_node("Abs", ["add_out"], ["X"], name="abs_node") + mul_node = onnx.helper.make_node("Mul", ["X", "B"], ["Y"], name="mul_node") + + graph = onnx.helper.make_graph( + [add_node, abs_node, mul_node], + "multi_io_graph", + [input_a, input_b], + [output_x, output_y], + ) + opset_imports = [ + onnx.helper.make_opsetid("", 18), + ] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + return onnx.shape_inference.infer_shapes(model) + + def test_make_io_channel_last(self): + """ + Test making a model's inputs and outputs channel-last. + """ + model = self.build_multi_input_output_model((1, 2, 3, 4)) + onnx.save_model(model, "model.onnx") + modified = qnn_preprocess_model( + "model.onnx", + "model.qnn_pp.onnx", + inputs_to_make_channel_last=["A", "B"], + outputs_to_make_channel_last=["X", "Y"], + ) + + self.assertTrue(modified) + + preproc_model = onnx.load_model("model.qnn_pp.onnx") + self.assertEqual(len(preproc_model.graph.node), 7) + + num_transposes = sum(1 for node in preproc_model.graph.node if node.op_type == "Transpose") + self.assertEqual(num_transposes, 4) + + # Check that the outputs of the new model are the same, but transposed. + input_a = np.arange(0.0, 24.0, 1.0, dtype=np.float32).reshape((1, 2, 3, 4)) + input_a_t = input_a.transpose(0, 2, 3, 1) + input_b = np.arange(1.0, 25.0, 1.0, dtype=np.float32).reshape((1, 2, 3, 4)) + input_b_t = input_b.transpose(0, 2, 3, 1) + + orig_session = onnxruntime.InferenceSession(model.SerializeToString(), providers=["CPUExecutionProvider"]) + orig_results = orig_session.run(None, {"A": input_a, "B": input_b}) + + new_session = onnxruntime.InferenceSession( + preproc_model.SerializeToString(), providers=["CPUExecutionProvider"] + ) + new_results = new_session.run(None, {"A": input_a_t, "B": input_b_t}) + + self.assertEqual(len(orig_results), len(new_results)) + for idx, orig_output in enumerate(orig_results): + transposed_output = new_results[idx] + np.testing.assert_allclose( + orig_output, + transposed_output.transpose(0, 3, 1, 2), + err_msg=f"Channel-last model output {idx} differs", + ) + + def test_make_io_channel_last_rank_error(self): + """ + Test making a model's inputs and outputs channel-last with a rank < 3 (error). 
+ """ + model = self.build_multi_input_output_model((1, 2)) + onnx.save_model(model, "model.onnx") + + with self.assertRaises(ValueError) as context: + qnn_preprocess_model( + "model.onnx", + "model.qnn_pp.onnx", + inputs_to_make_channel_last=["A", "B"], + outputs_to_make_channel_last=["X", "Y"], + ) + + self.assertIn("to be of rank >= 3", str(context.exception)) + if __name__ == "__main__": unittest.main() From 9460597b2103d8d07e88272b9f4e19700d71d632 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Sat, 2 Mar 2024 11:33:47 +0800 Subject: [PATCH 100/279] Update copying API header files (#19736) ### Description Make Linux logic consistent as Windows ### Motivation and Context onnxruntime_lite_custom_op.h in Windows zip package but not in Linux zip package https://github.com/microsoft/onnxruntime/blob/acbfc29f272b5578145e7600bc42342e116ffbc2/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml#L67 Co-authored-by: Your Name --- tools/ci_build/github/linux/copy_strip_binary.sh | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/tools/ci_build/github/linux/copy_strip_binary.sh b/tools/ci_build/github/linux/copy_strip_binary.sh index 42973a8fcb5b8..65d6d97ebf0a8 100755 --- a/tools/ci_build/github/linux/copy_strip_binary.sh +++ b/tools/ci_build/github/linux/copy_strip_binary.sh @@ -44,17 +44,10 @@ elif [[ $LIB_NAME == *.so.* ]] then ln -s $LIB_NAME $BINARY_DIR/$ARTIFACT_NAME/lib/libonnxruntime.so fi -cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_c_api.h $BINARY_DIR/$ARTIFACT_NAME/include -cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_cxx_api.h $BINARY_DIR/$ARTIFACT_NAME/include -cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_cxx_inline.h $BINARY_DIR/$ARTIFACT_NAME/include -cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_float16.h $BINARY_DIR/$ARTIFACT_NAME/include -cp $SOURCE_DIR/include/onnxruntime/core/providers/cpu/cpu_provider_factory.h $BINARY_DIR/$ARTIFACT_NAME/include -cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h $BINARY_DIR/$ARTIFACT_NAME/include -cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h $BINARY_DIR/$ARTIFACT_NAME/include +cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_*.h $BINARY_DIR/$ARTIFACT_NAME/include cp $SOURCE_DIR/include/onnxruntime/core/framework/provider_options.h $BINARY_DIR/$ARTIFACT_NAME/include -cp $SOURCE_DIR/orttraining/orttraining/training_api/include/onnxruntime_training_c_api.h $BINARY_DIR/$ARTIFACT_NAME/include -cp $SOURCE_DIR/orttraining/orttraining/training_api/include/onnxruntime_training_cxx_api.h $BINARY_DIR/$ARTIFACT_NAME/include -cp $SOURCE_DIR/orttraining/orttraining/training_api/include/onnxruntime_training_cxx_inline.h $BINARY_DIR/$ARTIFACT_NAME/include +cp $SOURCE_DIR/include/onnxruntime/core/providers/cpu/cpu_provider_factory.h $BINARY_DIR/$ARTIFACT_NAME/include +cp $SOURCE_DIR/orttraining/orttraining/training_api/include/onnxruntime_training_*.h $BINARY_DIR/$ARTIFACT_NAME/include if [[ -f "$BINARY_DIR/$BUILD_CONFIG/libonnxruntime_providers_cuda.so" ]]; then # copy headers for context context used in custom ops From 9acaf534a62050705d9b892a57ef0e8409fa62ec Mon Sep 17 00:00:00 2001 From: ironman Date: Mon, 4 Mar 2024 23:29:58 +0800 Subject: [PATCH 101/279] Benchmark - Updating llama-2 requirement files (#19716) ### Description ### Motivation and Context --- .../tools/transformers/models/llama/requirements-cuda.txt | 1 + 
.../python/tools/transformers/models/llama/requirements.txt | 3 ++-
.../python/tools/transformers/models/whisper/requirements.txt | 2 +-
3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/models/llama/requirements-cuda.txt b/onnxruntime/python/tools/transformers/models/llama/requirements-cuda.txt
index acd9c23aa42d0..307afbc122901 100644
--- a/onnxruntime/python/tools/transformers/models/llama/requirements-cuda.txt
+++ b/onnxruntime/python/tools/transformers/models/llama/requirements-cuda.txt
@@ -2,3 +2,4 @@
 # Please manually install torch>=2.2.0 with CUDA enabled for the CUDA version installed in your system.
 # Instructions can be found here: https://pytorch.org/get-started/locally/
 onnxruntime-gpu>=1.16.2
+py3nvml
\ No newline at end of file
diff --git a/onnxruntime/python/tools/transformers/models/llama/requirements.txt b/onnxruntime/python/tools/transformers/models/llama/requirements.txt
index 8b57279295e35..e991c2f27a1a3 100644
--- a/onnxruntime/python/tools/transformers/models/llama/requirements.txt
+++ b/onnxruntime/python/tools/transformers/models/llama/requirements.txt
@@ -1,6 +1,7 @@
 optimum>=1.14.1
-transformers>=4.33.2
+transformers>=4.33.2,<= 4.37.2
 torch>=2.2.0
 onnx>=1.14.0
 datasets>=2.8.0
 protobuf==3.20.2
+psutil
\ No newline at end of file
diff --git a/onnxruntime/python/tools/transformers/models/whisper/requirements.txt b/onnxruntime/python/tools/transformers/models/whisper/requirements.txt
index 956922dc83d51..9bbe0d7380406 100644
--- a/onnxruntime/python/tools/transformers/models/whisper/requirements.txt
+++ b/onnxruntime/python/tools/transformers/models/whisper/requirements.txt
@@ -7,8 +7,8 @@ soundfile
 librosa
 optimum
 onnxruntime-extensions>=0.9.0
+onnx>=1.15.0
 protobuf==3.20.2
 numpy==1.23.3
-onnx>=1.15.0
 psutil
 py3nvml

From 2e13d5f0ab54c726ee2400d38983000de7f61b8e Mon Sep 17 00:00:00 2001
From: inisis <46103969+inisis@users.noreply.github.com>
Date: Tue, 5 Mar 2024 01:41:36 +0800
Subject: [PATCH 102/279] fix split shape inference error for opset >= 13 (#19756)

### Description
Get the Split operator's split sections according to the model's opset version.

### Motivation and Context
For opset 13 and higher, the split sections are provided as the second input rather than as an attribute, so symbolic shape inference must read them from there.
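For context, a small sketch of the two forms being handled (built with `onnx.helper`; the tensor names are illustrative): before opset 13 the sections travel in the `split` attribute, while from opset 13 onward they arrive as an optional second input, which is where the fix below now looks.

```python
from onnx import helper

# Before opset 13: split sections are a node attribute.
split_v11 = helper.make_node(
    "Split", inputs=["X"], outputs=["Y0", "Y1"], axis=0, split=[2, 2]
)

# Opset 13 and later: split sections are an optional second input tensor.
split_v13 = helper.make_node(
    "Split", inputs=["X", "split_sizes"], outputs=["Y0", "Y1"], axis=0
)
```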
--- onnxruntime/python/tools/symbolic_shape_infer.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index 4b56bc1e8d828..4b029f9b172b0 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -1940,8 +1940,17 @@ def _infer_SoftmaxCrossEntropyLoss(self, node): # noqa: N802 def _infer_Split_Common(self, node, make_value_info_func): # noqa: N802 input_sympy_shape = self._get_sympy_shape(node, 0) axis = handle_negative_axis(get_attribute(node, "axis", 0), len(input_sympy_shape)) - split = get_attribute(node, "split") - if not split: + op_set = get_opset(self.out_mp_) + + # Depending on op-version 'split' are provided as attribute or via 2nd input + if op_set < 13: + split = get_attribute(node, "split") + assert self._try_get_value(node, 1) is None + else: + split = self._try_get_value(node, 1) + assert get_attribute(node, "split") is None + + if split is None: num_outputs = len(node.output) split = [input_sympy_shape[axis] / sympy.Integer(num_outputs)] * num_outputs self._update_computed_dims(split) From 27b1dc91abb71b71fe6a26e1b4ebd30e13524baf Mon Sep 17 00:00:00 2001 From: raoanag <127366241+raoanag@users.noreply.github.com> Date: Mon, 4 Mar 2024 11:55:35 -0800 Subject: [PATCH 103/279] [DML] MatrixMultiplyIntegerToFloat (#19608) ### Description DML Implementation for [com.microsoft.MatMulIntegerToFloat](https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#com.microsoft.MatMulIntegerToFloat) ``` .\onnxruntime_test_all.exe --gtest_filter="*MatMulIntegerToFloat.*" Note: Google Test filter = *MatMulIntegerToFloat.* [==========] Running 22 tests from 1 test suite. [----------] Global test environment set-up. 
[----------] 22 tests from MatMulIntegerToFloat [ RUN ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_S8S8 [ OK ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_S8S8 (620 ms) [ RUN ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_S8S8 [ OK ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_S8S8 (497 ms) [ RUN ] MatMulIntegerToFloat.NoZeroPoint_NoBias_test_S8S8 [ OK ] MatMulIntegerToFloat.NoZeroPoint_NoBias_test_S8S8 (488 ms) [ RUN ] MatMulIntegerToFloat.HasZeroPoint_HasBias_test_S8S8 [ OK ] MatMulIntegerToFloat.HasZeroPoint_HasBias_test_S8S8 (503 ms) [ RUN ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_U8U8 [ OK ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_U8U8 (495 ms) [ RUN ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_U8U8 [ OK ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_U8U8 (488 ms) [ RUN ] MatMulIntegerToFloat.NoZeroPoint_NoBias_test_U8U8 [ OK ] MatMulIntegerToFloat.NoZeroPoint_NoBias_test_U8U8 (492 ms) [ RUN ] MatMulIntegerToFloat.HasZeroPoint_HasBias_test_U8X8 [ OK ] MatMulIntegerToFloat.HasZeroPoint_HasBias_test_U8X8 (502 ms) [ RUN ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_S8U8 [ OK ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_S8U8 (452 ms) [ RUN ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_S8U8 [ OK ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_S8U8 (454 ms) [ RUN ] MatMulIntegerToFloat.NoZeroPoint_NoBias_test_S8U8 [ OK ] MatMulIntegerToFloat.NoZeroPoint_NoBias_test_S8U8 (446 ms) [ RUN ] MatMulIntegerToFloat.HasZeroPoint_HasBias_test_S8U8 [ OK ] MatMulIntegerToFloat.HasZeroPoint_HasBias_test_S8U8 (508 ms) [ RUN ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_U8S8 [ OK ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_U8S8 (456 ms) [ RUN ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_U8S8 [ OK ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_U8S8 (455 ms) [ RUN ] MatMulIntegerToFloat.NoZeroPoint_NoBias_test_U8S8 [ OK ] MatMulIntegerToFloat.NoZeroPoint_NoBias_test_U8S8 (447 ms) [ RUN ] MatMulIntegerToFloat.HasZeroPoint_HasBias_test_U8S8 [ OK ] MatMulIntegerToFloat.HasZeroPoint_HasBias_test_U8S8 (465 ms) [ RUN ] MatMulIntegerToFloat.MatMulIntegerToFloat_FP16_U8U8 [ OK ] MatMulIntegerToFloat.MatMulIntegerToFloat_FP16_U8U8 (111 ms) [ RUN ] MatMulIntegerToFloat.MatMulIntegerToFloat_FP16_U8S8 [ OK ] MatMulIntegerToFloat.MatMulIntegerToFloat_FP16_U8S8 (115 ms) [ RUN ] MatMulIntegerToFloat.MatMulIntegerToFloat_FP16_S8S8 [ OK ] MatMulIntegerToFloat.MatMulIntegerToFloat_FP16_S8S8 (114 ms) [ RUN ] MatMulIntegerToFloat.MatMulIntegerToFloat_FP16_S8U8 [ OK ] MatMulIntegerToFloat.MatMulIntegerToFloat_FP16_S8U8 (110 ms) [ RUN ] MatMulIntegerToFloat.MatMulIntegerToFloat_FP16 [ OK ] MatMulIntegerToFloat.MatMulIntegerToFloat_FP16 (112 ms) [ RUN ] MatMulIntegerToFloat.MatMulInteger_With_ZeroPoint [ OK ] MatMulIntegerToFloat.MatMulInteger_With_ZeroPoint (337 ms) [----------] 22 tests from MatMulIntegerToFloat (8679 ms total) [----------] Global test environment tear-down [==========] 22 tests from 1 test suite ran. (8680 ms total) [ PASSED ] 22 tests. memleakdbg: ----- No memory leaks detected ----- ``` ### Motivation and Context * `CalculateMatMulIntegerToFloat` to replace CPU EP run reference * Added more FP32 testcases to isolate all input datatype combinations * Added fixed input to `MatMulIntegerToFloat_FP16*` test cases as for FP16 test cases. 
* onnxruntime/test/testdata/matmul_integer_to_float.py` is capable of generating FP16 models, but we do not produce any for now --- docs/ContribOperators.md | 2 +- docs/OperatorKernels.md | 1 + .../graph/contrib_ops/quantization_defs.cc | 2 +- .../core/optimizer/graph_transformer_utils.cc | 5 +- .../core/optimizer/matmul_integer_to_float.cc | 23 +- .../src/External/DirectMLHelpers/ApiTraits.h | 12 +- .../External/DirectMLHelpers/DirectMLSchema.h | 37 +- .../DirectMLHelpers/GeneratedSchemaHelpers.h | 36 +- .../DmlOperatorMatMulIntegerToFloat.cpp | 111 +++++ .../src/Operators/OperatorRegistration.cpp | 9 + .../dml/OperatorAuthorHelper/OperatorHelper.h | 2 +- .../OperatorAuthorHelper/OperatorVersions.h | 1 + .../matmul_integer_to_float_test.cc | 414 +++++++++++++++--- .../test/optimizer/graph_transform_test.cc | 18 + .../test/testdata/matmul_integer_to_float.py | 60 ++- .../matmul_integer_to_float_int8.onnx | 4 +- .../matmul_integer_to_float_int8_bias.onnx | 4 +- .../matmul_integer_to_float_int8_int8.onnx | 4 +- ...atmul_integer_to_float_int8_int8_bias.onnx | 4 +- .../matmul_integer_to_float_uint8.onnx | 4 +- .../matmul_integer_to_float_uint8_bias.onnx | 4 +- .../fusion/matmul_integer_to_float.onnx | Bin 1520 -> 1520 bytes .../matmul_integer_to_float16_int8.onnx | 51 +++ 23 files changed, 664 insertions(+), 144 deletions(-) create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorMatMulIntegerToFloat.cpp create mode 100644 onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index f523e97293427..e295dfa203ae5 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -2795,7 +2795,7 @@ This version of the operator has been available since version 1 of the 'com.micr
Constrain input A data type to 8-bit integer tensor.
T2 : tensor(int8), tensor(uint8)
Constrain input B data type to 8-bit integer tensor.
-
T3 : tensor(float)
+
T3 : tensor(float), tensor(float16)
Constrain input a_scale, b_scale and output Y data type as float tensor.
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 1eaf0fb6dad76..0e60b4622f2fb 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -1268,6 +1268,7 @@ Do not modify directly.* |FusedMatMulActivation|*in* A:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| |Gelu|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| |GroupNorm|*in* X:**T**
*in* gamma:**M**
*in* beta:**M**
*out* Y:**T**|1+|**M** = tensor(float), tensor(float16)
**T** = tensor(float), tensor(float16)| +|MatMulIntegerToFloat|*in* A:**T1**
*in* B:**T2**
*in* a_scale:**T3**
*in* b_scale:**T3**
*in* a_zero_point:**T1**
*in* b_zero_point:**T2**
*in* bias:**T3**
*out* Y:**T3**|1+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(float), tensor(float16)| |MultiHeadAttention|*in* query:**T**
*in* key:**T**
*in* value:**T**
*in* bias:**T**
*in* key_padding_mask:**M**
*in* relative_position_bias:**T**
*in* past_key:**T**
*in* past_value:**T**
*out* output:**T**
*out* present_key:**T**
*out* present_value:**T**|1+|**M** = tensor(int32)
**T** = tensor(float), tensor(float16)| |NhwcConv|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| |QLinearAdd|*in* A:**T**
*in* A_scale:**tensor(float)**
*in* A_zero_point:**T**
*in* B:**T**
*in* B_scale:**tensor(float)**
*in* B_zero_point:**T**
*in* C_scale:**tensor(float)**
*in* C_zero_point:**T**
*out* C:**T**|1+|**T** = tensor(int8), tensor(uint8)| diff --git a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc index 4313fae767fe5..22a79ef652515 100644 --- a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc @@ -434,7 +434,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA( .Output(0, "Y", "Matrix multiply results from A * B", "T3") .TypeConstraint("T1", {"tensor(int8)", "tensor(uint8)"}, "Constrain input A data type to 8-bit integer tensor.") .TypeConstraint("T2", {"tensor(int8)", "tensor(uint8)"}, "Constrain input B data type to 8-bit integer tensor.") - .TypeConstraint("T3", {"tensor(float)"}, + .TypeConstraint("T3", {"tensor(float)", "tensor(float16)"}, "Constrain input a_scale, b_scale and output Y data type as float tensor.") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { propagateElemTypeFromInputToOutput(ctx, 2, 0); diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index 8376b87aee6b2..f319e7254568d 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -278,7 +278,8 @@ InlinedVector> GenerateTransformers( onnxruntime::kAclExecutionProvider, onnxruntime::kArmNNExecutionProvider, onnxruntime::kJsExecutionProvider}; - + const InlinedHashSet cpu_dml_eps = {onnxruntime::kCpuExecutionProvider, + onnxruntime::kDmlExecutionProvider}; #ifdef MLAS_TARGET_AMD64_IX86 const bool avx2_precision_mode = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsAvx2PrecisionMode, "0") == "1" && MlasPlatformU8S8Overflow(); @@ -296,7 +297,7 @@ InlinedVector> GenerateTransformers( } transformers.emplace_back(std::make_unique(cpu_ep)); - transformers.emplace_back(std::make_unique(cpu_ep)); + transformers.emplace_back(std::make_unique(cpu_dml_eps)); transformers.emplace_back(std::make_unique(cpu_ep)); transformers.emplace_back(std::make_unique(cpu_cuda_rocm_acl_armnn_js_eps)); diff --git a/onnxruntime/core/optimizer/matmul_integer_to_float.cc b/onnxruntime/core/optimizer/matmul_integer_to_float.cc index 56e51cb787931..4fee1a6ce224e 100644 --- a/onnxruntime/core/optimizer/matmul_integer_to_float.cc +++ b/onnxruntime/core/optimizer/matmul_integer_to_float.cc @@ -31,6 +31,24 @@ static bool CheckBiasShape(const TensorShapeProto* bias_shape) { return bias_last_dim > 1; } +bool HasElementDataType(const NodeArg& node_arg, int32_t data_type) { + if (!node_arg.Exists()) { + return false; + } + + const auto* type_proto = node_arg.TypeAsProto(); + if (!type_proto) { + return false; + } + + int32_t actual_data_type; + if (!utils::TryGetElementDataType(*type_proto, actual_data_type)) { + return false; + } + + return data_type == actual_data_type; +} + /** MatMulIntegerToFloatFusion will fuse subgraph like below into MatMulIntegerToFloat: @@ -63,9 +81,10 @@ Status MatMulIntegerToFloatFusion::ApplyImpl(Graph& graph, bool& modified, int g auto& mul_node = *node_ptr; ORT_RETURN_IF_ERROR(Recurse(mul_node, modified, graph_level, logger)); - + const bool is_dml_ep = node_ptr->GetExecutionProviderType() == kDmlExecutionProvider; if (!graph_utils::IsSupportedOptypeVersionAndDomain(mul_node, "Mul", {7, 13, 14}) || - !graph_utils::IsSupportedProvider(mul_node, GetCompatibleExecutionProviders())) { + !graph_utils::IsSupportedProvider(mul_node, GetCompatibleExecutionProviders()) || + (!is_dml_ep && 
HasElementDataType(*mul_node.InputDefs()[0], ONNX_NAMESPACE::TensorProto_DataType_FLOAT16))) { continue; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h index e1e7eacfbd85d..7c25755a7d09e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h @@ -879,6 +879,12 @@ struct OperatorDescTraits static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_QUANTIZED_LINEAR_MATRIX_MULTIPLY; }; +template <> +struct OperatorDescTraits +{ + static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT; +}; + template <> struct OperatorDescTraits { @@ -1041,12 +1047,6 @@ struct OperatorDescTraits static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING; }; -template <> -struct OperatorDescTraits -{ - static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT; -}; - template <> struct OperatorDescTraits { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h index 5fe6603c2a0bf..da57c2aa235fd 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h @@ -1885,6 +1885,25 @@ constexpr DML_OPERATOR_SCHEMA DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHE DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHEMA_FIELDS, }; +constexpr DML_SCHEMA_FIELD DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS[8] { + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ATensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AScaleTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AZeroPointTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BScaleTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BZeroPointTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BiasTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, +}; + +constexpr DML_OPERATOR_SCHEMA DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA { + "DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT", + static_cast(DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT), + DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, + 8, + DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS, +}; + constexpr DML_SCHEMA_FIELD DML_CONVOLUTION_INTEGER_OPERATOR_SCHEMA_FIELDS[11] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputZeroPointTensor", true }, @@ -2395,24 +2414,6 @@ constexpr DML_OPERATOR_SCHEMA 
DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHE DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA_FIELDS, }; -constexpr DML_SCHEMA_FIELD DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS[8] { - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ATensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AScaleTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AZeroPointTensor", true }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BScaleTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BZeroPointTensor", true }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BiasTensor", true }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, -}; - -constexpr DML_OPERATOR_SCHEMA DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA { - "DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT", - DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT, - DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, - 8, - DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS, -}; constexpr DML_SCHEMA_FIELD DML_ACTIVATION_ELU_OPERATOR_SCHEMA_FIELDS[3] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h index 4be41ad3924a2..86c66d8cca26c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h @@ -1139,6 +1139,19 @@ inline std::vector GetFields(const DML_QUANTIZED_LINEAR_MATRIX_MU OperatorField(&DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHEMA.Fields[8], ToOperatorFieldType(static_cast(desc.OutputTensor))), }; } +inline std::vector GetFields(const DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC& desc) +{ + return { + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.ATensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.AScaleTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.AZeroPointTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.BTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.BScaleTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[5], ToOperatorFieldType(static_cast(desc.BZeroPointTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[6], ToOperatorFieldType(static_cast(desc.BiasTensor))), + 
OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.OutputTensor))), + }; +} inline std::vector GetFields(const DML_CONVOLUTION_INTEGER_OPERATOR_DESC& desc) { return { @@ -1487,19 +1500,6 @@ inline std::vector GetFields(const DML_QUANTIZED_LINEAR_AVERAGE_P OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[12], ToOperatorFieldType(static_cast(desc.IncludePadding))), }; } -inline std::vector GetFields(const DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC& desc) -{ - return { - OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.ATensor))), - OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.AScaleTensor))), - OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.AZeroPointTensor))), - OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.BTensor))), - OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.BScaleTensor))), - OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[5], ToOperatorFieldType(static_cast(desc.BZeroPointTensor))), - OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[6], ToOperatorFieldType(static_cast(desc.BiasTensor))), - OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.OutputTensor))), - }; -} inline std::vector GetFields(const DML_ACTIVATION_ELU_OPERATOR_DESC& desc) { return { @@ -1829,6 +1829,7 @@ inline const DML_OPERATOR_SCHEMA& GetSchema(DML_OPERATOR_TYPE operatorType) case DML_OPERATOR_RESAMPLE1: return DML_RESAMPLE1_OPERATOR_SCHEMA; case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER: return DML_MATRIX_MULTIPLY_INTEGER_OPERATOR_SCHEMA; case DML_OPERATOR_QUANTIZED_LINEAR_MATRIX_MULTIPLY: return DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHEMA; + case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: return DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA; case DML_OPERATOR_CONVOLUTION_INTEGER: return DML_CONVOLUTION_INTEGER_OPERATOR_SCHEMA; case DML_OPERATOR_QUANTIZED_LINEAR_CONVOLUTION: return DML_QUANTIZED_LINEAR_CONVOLUTION_OPERATOR_SCHEMA; case DML_OPERATOR_ELEMENT_WISE_BIT_AND: return DML_ELEMENT_WISE_BIT_AND_OPERATOR_SCHEMA; @@ -1856,7 +1857,6 @@ inline const DML_OPERATOR_SCHEMA& GetSchema(DML_OPERATOR_TYPE operatorType) case DML_OPERATOR_DIAGONAL_MATRIX1: return DML_DIAGONAL_MATRIX1_OPERATOR_SCHEMA; case DML_OPERATOR_MULTIHEAD_ATTENTION: return DML_MULTIHEAD_ATTENTION_OPERATOR_SCHEMA; case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: return DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA; - case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: return DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA; case DML_OPERATOR_ACTIVATION_ELU: return DML_ACTIVATION_ELU_OPERATOR_SCHEMA; case DML_OPERATOR_ACTIVATION_CELU: return DML_ACTIVATION_CELU_OPERATOR_SCHEMA; case DML_OPERATOR_ACTIVATION_HARDMAX: return DML_ACTIVATION_HARDMAX_OPERATOR_SCHEMA; @@ -2360,6 +2360,10 @@ inline AbstractOperatorDesc ConvertOperatorDesc(const DML_OPERATOR_DESC& opDesc) return AbstractOperatorDesc( &DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHEMA, GetFields(*static_cast(opDesc.Desc))); + case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: + return 
AbstractOperatorDesc( + &DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA, + GetFields(*static_cast(opDesc.Desc))); case DML_OPERATOR_CONVOLUTION_INTEGER: return AbstractOperatorDesc( &DML_CONVOLUTION_INTEGER_OPERATOR_SCHEMA, @@ -2468,10 +2472,6 @@ inline AbstractOperatorDesc ConvertOperatorDesc(const DML_OPERATOR_DESC& opDesc) return AbstractOperatorDesc( &DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA, GetFields(*static_cast(opDesc.Desc))); - case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: - return AbstractOperatorDesc( - &DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA, - GetFields(*static_cast(opDesc.Desc))); case DML_OPERATOR_ACTIVATION_ELU: return AbstractOperatorDesc( &DML_ACTIVATION_ELU_OPERATOR_SCHEMA, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorMatMulIntegerToFloat.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorMatMulIntegerToFloat.cpp new file mode 100644 index 0000000000000..b5a3dd0960b86 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorMatMulIntegerToFloat.cpp @@ -0,0 +1,111 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "precomp.h" + +namespace Dml +{ + +class DmlOperatorMatMulIntegerToFloat : public DmlOperator +{ + enum OrtInputTensors : uint32_t + { + ortA, + ortB, + ortAScale, + ortBScale, + ortAZeroPoint, + ortBZeroPoint, + ortBias, + ortInputCount + }; + + enum DmlInputIndex : uint32_t + { + dmlA, + dmlAScale, + dmlAZeroPoint, + dmlB, + dmlBScale, + dmlBZeroPoint, + dmlBias, + dmlInputCount, + }; + +public: + DmlOperatorMatMulIntegerToFloat(const MLOperatorKernelCreationContext& kernelInfo) + : DmlOperator(kernelInfo) + { + std::vector> inputIndices = { OrtInputTensors::ortA, OrtInputTensors::ortAScale, OrtInputTensors::ortAZeroPoint, OrtInputTensors::ortB, OrtInputTensors::ortBScale, OrtInputTensors::ortBZeroPoint, OrtInputTensors::ortBias }; + DmlOperator::Initialize(kernelInfo, inputIndices); + + std::vector inputShape0 = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(OrtInputTensors::ortA); + std::vector inputShape1 = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(OrtInputTensors::ortB); + std::vector outputShape = kernelInfo.GetTensorShapeDescription().GetOutputTensorShape(0); + + OperatorHelper::MatMulShapeMapping(inputShape0, inputShape1, outputShape); + + // Initialize the input descriptions with broadcasting + m_inputTensorDescs[DmlInputIndex::dmlA] = CreateTensorDescFromInput(kernelInfo, OrtInputTensors::ortA, TensorAxis::DoNotCoerce, TensorAxis::W, TensorAxis::RightAligned, inputShape0); + m_inputTensorDescs[DmlInputIndex::dmlB] = CreateTensorDescFromInput(kernelInfo, OrtInputTensors::ortB, TensorAxis::DoNotCoerce, TensorAxis::W, TensorAxis::RightAligned, inputShape1); + + // Broadcast Bias tensor to the shape of the output tensor. + if(kernelInfo.IsInputValid(OrtInputTensors::ortBias)) { + m_inputTensorDescs[DmlInputIndex::dmlBias] = CreateTensorDescFromInput(kernelInfo, OrtInputTensors::ortBias, TensorAxis::DoNotCoerce, + TensorAxis::W, TensorAxis::RightAligned, outputShape); + } + + uint32_t dmlDimSize = m_inputTensorDescs[DmlInputIndex::dmlA].GetDimensionCount(); + // Resize the A Scale to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. 
+ m_inputTensorDescs[DmlInputIndex::dmlAScale] = CreateTensorDescFromInput( + kernelInfo, + OrtInputTensors::ortAScale, + TensorAxis::DoNotCoerce, + TensorAxis::H, + TensorAxis::LeftAligned, + std::nullopt, + dmlDimSize + ); + + // Resize the A ZeroPoint to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. + if (kernelInfo.IsInputValid(OrtInputTensors::ortAZeroPoint)) + { + m_inputTensorDescs[DmlInputIndex::dmlAZeroPoint] = CreateTensorDescFromInput( + kernelInfo, + OrtInputTensors::ortAZeroPoint, + TensorAxis::DoNotCoerce, + TensorAxis::H, + TensorAxis::LeftAligned, + std::nullopt, + dmlDimSize + ); + } + + // B Zeropoint and BScale are already aligned in the W dimension so no need to align them + + // Initialize the output description while overriding the shape + m_outputTensorDescs[0] = CreateTensorDescFromOutput(kernelInfo, 0, TensorAxis::DoNotCoerce, TensorAxis::W, TensorAxis::RightAligned, outputShape); + + std::vector inputDescs = GetDmlInputDescs(); + std::vector outputDescs = GetDmlOutputDescs(); + + DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC matMulDesc = {}; + matMulDesc.ATensor = &inputDescs[DmlInputIndex::dmlA]; + matMulDesc.AScaleTensor = &inputDescs[DmlInputIndex::dmlAScale]; + matMulDesc.AZeroPointTensor = inputDescs[DmlInputIndex::dmlAZeroPoint].Desc != nullptr ? &inputDescs[DmlInputIndex::dmlAZeroPoint] : nullptr; + matMulDesc.BTensor = &inputDescs[DmlInputIndex::dmlB]; + matMulDesc.BScaleTensor = &inputDescs[DmlInputIndex::dmlBScale]; + matMulDesc.BZeroPointTensor = inputDescs[DmlInputIndex::dmlBZeroPoint].Desc != nullptr ? &inputDescs[DmlInputIndex::dmlBZeroPoint] : nullptr; + matMulDesc.BiasTensor = inputDescs[DmlInputIndex::dmlBias].Desc != nullptr ? &inputDescs[DmlInputIndex::dmlBias] : nullptr; + matMulDesc.OutputTensor = &outputDescs[0]; + + DML_OPERATOR_DESC opDesc = { (DML_OPERATOR_TYPE) DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT, &matMulDesc }; + SetDmlOperatorDesc(opDesc, kernelInfo); + } +}; + +DML_OP_DEFINE_CREATION_FUNCTION(MatMulIntegerToFloat, DmlOperatorMatMulIntegerToFloat); + +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp index 9c136ed8c9484..f08151b61197a 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp @@ -503,6 +503,7 @@ DML_OP_EXTERN_CREATION_FUNCTION(QLinearMatMul); DML_OP_EXTERN_CREATION_FUNCTION(QLinearConcat); DML_OP_EXTERN_CREATION_FUNCTION(DynamicQuantizeLinear); DML_OP_EXTERN_CREATION_FUNCTION(MatMulInteger); +DML_OP_EXTERN_CREATION_FUNCTION(MatMulIntegerToFloat); DML_OP_EXTERN_CREATION_FUNCTION(ConvInteger); DML_OP_EXTERN_CREATION_FUNCTION(Trilu); @@ -622,6 +623,13 @@ constexpr static std::array supportedTypeListQLinea SupportedTensorDataTypes::Int8|SupportedTensorDataTypes::UInt8, SupportedTensorDataTypes::Int8|SupportedTensorDataTypes::UInt8 }; + +constexpr static std::array supportedTypeListMatMulIntegerToFloat = { + SupportedTensorDataTypes::Ints8Bit, + SupportedTensorDataTypes::Ints8Bit, + SupportedTensorDataTypes::Float16to32 +}; + constexpr static std::array supportedTypeListQLinearConv = { SupportedTensorDataTypes::Int8|SupportedTensorDataTypes::UInt8, SupportedTensorDataTypes::Int8|SupportedTensorDataTypes::UInt8, @@ -1083,6 +1091,7 @@ constexpr static 
OperatorRegistrationInformation operatorRegistrationInformation {REG_INFO( 10, QLinearConv, typeNameListFour, supportedTypeListQLinearConv, DmlGraphSupport::Supported)}, {REG_INFO( 10, QLinearMatMul, typeNameListThree, supportedTypeListQLinearMatMul, DmlGraphSupport::Supported)}, {REG_INFO( 10, MatMulInteger, typeNameListThree, supportedTypeListInteger, DmlGraphSupport::Supported)}, + {REG_INFO_MS( 1, MatMulIntegerToFloat, typeNameListThree, supportedTypeListMatMulIntegerToFloat, DmlGraphSupport::Supported)}, {REG_INFO( 10, ConvInteger, typeNameListThree, supportedTypeListInteger, DmlGraphSupport::Supported)}, {REG_INFO( 11, DynamicQuantizeLinear, typeNameListTwo, supportedTypeListDynamicQuantizeLinear, DmlGraphSupport::Supported)}, {REG_INFO( 7, LayerNormalization, typeNameListLayerNormContrib, supportedTypeListLayerNormalizationContrib, DmlGraphSupport::Supported, requiredConstantCpuInputs(), std::nullopt, QueryLayerNormalization)}, diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h index 1b2521a86613f..06bacc1b28c99 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h @@ -870,7 +870,6 @@ class QLinearMatMulHelper : public MatMulHelperBase QLinearMatMulHelper(const Info_t& info, const Shape_t& shape) : MatMulHelperBase(info, shape, 0, 3) {} }; - class TopKHelper { void Initialize( @@ -1776,6 +1775,7 @@ using ShapeInferenceHelper_Identity16 = GetOutputShapeAsInputShapeHelper; using ShapeInferenceHelper_Identity19 = GetOutputShapeAsInputShapeHelper; using ShapeInferenceHelper_MatMul = MatMulHelper; using ShapeInferenceHelper_MatMulInteger = MatMulHelper; +using ShapeInferenceHelper_MatMulIntegerToFloat = MatMulHelper; using ShapeInferenceHelper_QLinearMatMul = QLinearMatMulHelper; using ShapeInferenceHelper_QLinearAdd = GetBroadcastedOutputShapeHelper; using ShapeInferenceHelper_DynamicQuantizeLinear = GetOutputShapeAsInputShapeHelper; diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h index e725ba085113d..d081aa2e29148 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h @@ -449,6 +449,7 @@ namespace OperatorHelper static const int sc_sinceVer_FusedMatMulActivation = 1; static const int sc_sinceVer_QLinearSigmoid = 1; static const int sc_sinceVer_Attention = 1; + static const int sc_sinceVer_MatMulIntegerToFloat = 1; static const int sc_sinceVer_MultiHeadAttention = 1; static const int sc_sinceVer_SkipLayerNormalization = 1; static const int sc_sinceVer_EmbedLayerNormalization = 1; diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index 26ce5272d25ee..6f3ca7e239671 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -23,135 +23,407 @@ using namespace std; namespace onnxruntime { namespace test { -template -void TestMatMulIntegerToFloat(const std::vector& A_dims, - std::vector B_dims, - const std::string& reference_model, - bool is_matrix_b_constant, +template +static void CalculateMatMulIntegerToFloat(const int64_t M, const int64_t N, const int64_t K, + const std::vector& A_data, const std::vector& 
A_scale, + const std::vector& A_zero_point, const std::vector& B_data, + std::vector& B_scale, std::vector& B_zero_point, + const std::vector& Bias, std::vector& Y_data, + bool per_column, bool has_zp, bool has_bias) { + if (!per_column) { + B_zero_point.resize(N, B_zero_point[0]); + B_scale.resize(N, B_scale[0]); + } + + for (int64_t m = 0; m < M; m++) { + for (int64_t n = 0; n < N; n++) { + float sum = 0.0f; + for (int64_t k = 0; k < K; k++) { + float A_dequantized = has_zp ? (static_cast(A_data[m * K + k]) - static_cast(A_zero_point[0])) * A_scale[0] : A_data[m * K + k] * A_scale[0]; + float B_dequantized = has_zp ? (static_cast(B_data[k * N + n]) - static_cast(B_zero_point[n])) * B_scale[n] : B_data[k * N + n] * B_scale[n]; + + sum += A_dequantized * B_dequantized; + } + if (has_bias) { + sum += Bias[n]; + } + Y_data[m * N + n] = static_cast(sum); + } + } +} + +template +void TestMatMulIntegerToFloat(bool is_matrix_b_constant, bool per_column = false, bool has_zp = true, bool has_bias = false) { // create rand inputs RandomValueGenerator random{}; - + int64_t M = 4; + int64_t N = 128; + int64_t K = 128; + std::vector A_dims{M, K}; + std::vector B_dims{K, N}; + std::vector Y_dims{M, K}; std::vector A_data; - std::vector tmp_A_data = random.Uniform(A_dims, - std::numeric_limits::lowest(), - std::numeric_limits::max()); - std::transform(tmp_A_data.begin(), tmp_A_data.end(), std::back_inserter(A_data), [](int32_t v) -> WType { + std::vector tmp_A_data = random.Uniform(A_dims, + std::numeric_limits::lowest(), + std::numeric_limits::max()); + std::transform(tmp_A_data.begin(), tmp_A_data.end(), std::back_inserter(A_data), [](int32_t v) -> IType { return static_cast(v); }); std::vector B_data; - std::vector tmp_B_data = random.Uniform(B_dims, - std::numeric_limits::lowest(), - std::numeric_limits::max()); + + std::vector tmp_B_data; + tmp_B_data = random.Uniform(B_dims, + std::is_signed::value ? std::numeric_limits::lowest() / 2 : std::numeric_limits::lowest(), + std::numeric_limits::max() / 2); std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> WType { return static_cast(v); }); - std::vector A_scale = random.Uniform(AsSpan({1}), -0.1f, 0.1f); + std::vector A_scale = random.Uniform(AsSpan({1}), -0.1f, 0.1f); std::vector A_zero_point{(std::numeric_limits::lowest() + std::numeric_limits::max() + IType(2)) / 2}; int64_t b_scale_zp_size = per_column ? 
B_dims.back() : 1; - std::vector B_scale = random.Uniform(AsSpan({b_scale_zp_size}), -0.1f, 0.1f); + std::vector B_scale = random.Uniform(AsSpan({b_scale_zp_size}), -0.1f, 0.1f); std::vector B_zero_point(b_scale_zp_size); std::for_each(B_zero_point.begin(), B_zero_point.end(), [&random](WType& zp) { - zp = static_cast(random.Uniform(std::array{1}, - std::numeric_limits::lowest(), - std::numeric_limits::max())[0]); + zp = static_cast(random.Uniform(std::array{1}, + std::numeric_limits::lowest(), + std::numeric_limits::max())[0]); }); - std::vector Bias = random.Uniform(AsSpan({B_dims.back()}), -0.1f, 0.1f); + std::vector Bias = random.Uniform(AsSpan({B_dims.back()}), -0.1f, 0.1f); OpTester test("MatMulIntegerToFloat", 1, onnxruntime::kMSDomain); test.AddInput("A", A_dims, A_data); test.AddInput("B", B_dims, B_data, is_matrix_b_constant); - test.AddInput("a_scale", {1}, A_scale); - test.AddInput("b_scale", {b_scale_zp_size}, B_scale); + test.AddInput("a_scale", {1}, A_scale); + test.AddInput("b_scale", {b_scale_zp_size}, B_scale); if (has_zp) { test.AddInput("a_zero_point", {1}, A_zero_point); test.AddInput("b_zero_point", {b_scale_zp_size}, B_zero_point); } else { - test.AddOptionalInputEdge(); + test.AddOptionalInputEdge(); test.AddOptionalInputEdge(); } if (has_bias) { - test.AddInput("bias", {B_dims.back()}, Bias); + test.AddInput("bias", {B_dims.back()}, Bias); } else { - test.AddOptionalInputEdge(); + test.AddOptionalInputEdge(); } - test.AddReferenceOutputs(reference_model); - test.SetOutputRelErr("Y", 1e-4f); - test.Run(); -} + std::vector Y_data(M * N); + CalculateMatMulIntegerToFloat(M, N, K, A_data, A_scale, A_zero_point, + B_data, B_scale, B_zero_point, Bias, Y_data, + per_column, has_zp, has_bias); -template -void RunMatMulIntegerToFloatTest(const string& model_path) { - std::vector A_dims{4, 128}; - std::vector B_dims{128, 128}; - std::vector Y_dims{4, 128}; + if (std::is_same_v) { + test.AddOutput("Y", {M, N}, Y_data); + test.SetOutputRelErr("Y", 0.02f); + } else { + test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); + test.SetOutputAbsErr("Y", 0.5f); + } - TestMatMulIntegerToFloat(A_dims, - B_dims, - model_path, - false, /*is_matrix_b_constant*/ - false, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ + // Only DML EP supports these data type combinations for now + if (std::is_same_v || + (std::is_same_v && + std::is_same_v && + std::is_same_v)) { + std::vector> execution_providers; + execution_providers.push_back(DefaultDmlExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } else { + test.Run(); + } +} + +template +void RunMatMulIntegerToFloatTest() { + TestMatMulIntegerToFloat( + false, /*is_matrix_b_constant*/ + false, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); - TestMatMulIntegerToFloat(A_dims, - B_dims, - model_path, - true, /*is_matrix_b_constant*/ - false, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ + TestMatMulIntegerToFloat( + true, /*is_matrix_b_constant*/ + false, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); - TestMatMulIntegerToFloat(A_dims, - B_dims, - model_path, - false, /*is_matrix_b_constant*/ - true, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ + TestMatMulIntegerToFloat( + false, /*is_matrix_b_constant*/ + true, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); - TestMatMulIntegerToFloat(A_dims, - B_dims, - model_path, - true, /*is_matrix_b_constant*/ - true, /*per_column*/ - 
HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ + TestMatMulIntegerToFloat( + true, /*is_matrix_b_constant*/ + true, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); } -TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8.onnx"); - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_uint8.onnx"); +TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8) { + RunMatMulIntegerToFloatTest(); } -TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8X8) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8_bias.onnx"); - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_uint8_bias.onnx"); +TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8) { + RunMatMulIntegerToFloatTest(); } -TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8_int8.onnx"); +TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_S8S8) { + RunMatMulIntegerToFloatTest(); } -TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8_int8_bias.onnx"); +TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_S8S8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8U8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8U8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8U8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8X8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8S8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8S8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8S8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8) { + RunMatMulIntegerToFloatTest(); +} + +// DML EP supports Float16 output type and Signed A Matrix and Unsigned B Matric for Float32 output +#if defined(USE_DML) + +TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8U8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8U8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_S8U8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_S8U8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_U8U8) { + OpTester test("MatMulIntegerToFloat", 1, kMSDomain); + int64_t M = 5; + int64_t N = 5; + int64_t K = 2; + + std::vector A_data = {1, 5, 2, 1, 9, + 1, 1, 3, 7, 2}; + std::vector B_data = {3, 7, 2, 1, 1, + 2, 1, 9, 1, 1}; + std::vector A_scale = ToFloat16({3.0f}); + std::vector B_scale = ToFloat16({2.0f}); + test.AddInput("A", {M, K}, A_data); + test.AddInput("B", {K, N}, B_data); + std::vector A_zero_point = {1}; + std::vector B_zero_point = {1}; + + test.AddInput("a_scale", {1}, A_scale); + test.AddInput("b_scale", {1}, B_scale); + test.AddInput("a_zero_point", {1}, A_zero_point); + test.AddInput("b_zero_point", {1}, B_zero_point); + + std::vector Y_data(M * N); + CalculateMatMulIntegerToFloat(M, N, K, A_data, A_scale, A_zero_point, + B_data, B_scale, B_zero_point, {}, Y_data, + false, true, false); + + test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); + 
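// Run on the DML EP only: as noted in the comment above, it is currently the only provider in this test file that exercises the FP16 output type of MatMulIntegerToFloat.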
std::vector> execution_providers; + execution_providers.push_back(DefaultDmlExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_U8S8) { + OpTester test("MatMulIntegerToFloat", 1, kMSDomain); + int64_t M = 5; + int64_t N = 5; + int64_t K = 2; + + std::vector A_data = {3, 7, 2, 1, 1, + 2, 1, 9, 1, 1}; + std::vector B_data = {2, -1, -9, 1, 1, + -1, 0, -3, 1, -4}; + std::vector A_scale = ToFloat16({-4.0f}); + std::vector B_scale = ToFloat16({2.0f}); + test.AddInput("A", {M, K}, A_data); + test.AddInput("B", {K, N}, B_data); + std::vector A_zero_point = {1}; + std::vector B_zero_point = {3}; + std::vector Bias = ToFloat16({11.0f, -17.0f, 1.0f, -3.0f, 12.0f}); + + test.AddInput("a_scale", {1}, A_scale); + test.AddInput("b_scale", {1}, B_scale); + test.AddInput("a_zero_point", {1}, A_zero_point); + test.AddInput("b_zero_point", {1}, B_zero_point); + + std::vector Y_data(M * N); + CalculateMatMulIntegerToFloat(M, N, K, A_data, A_scale, A_zero_point, + B_data, B_scale, B_zero_point, {}, Y_data, + false, true, false); + + test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); + + std::vector> execution_providers; + execution_providers.push_back(DefaultDmlExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_S8S8) { + OpTester test("MatMulIntegerToFloat", 1, kMSDomain); + int64_t M = 5; + int64_t N = 5; + int64_t K = 2; + + std::vector A_data = {3, 7, -2, 1, 1, + 2, -1, -9, 1, 1}; + std::vector B_data = {2, -1, -9, 1, 1, + -1, 0, -3, 1, -4}; + std::vector A_scale = ToFloat16({-4.0f}); + std::vector B_scale = ToFloat16({2.0f}); + test.AddInput("A", {M, K}, A_data); + test.AddInput("B", {K, N}, B_data); + std::vector A_zero_point = {-1}; + std::vector B_zero_point = {3}; + std::vector Bias = ToFloat16({11.0f, -17.0f, 1.0f, -3.0f, 12.0f}); + + test.AddInput("a_scale", {1}, A_scale); + test.AddInput("b_scale", {1}, B_scale); + test.AddInput("a_zero_point", {1}, A_zero_point); + test.AddInput("b_zero_point", {1}, B_zero_point); + test.AddInput("bias", {N}, Bias); + + std::vector Y_data(M * N); + CalculateMatMulIntegerToFloat(M, N, K, A_data, A_scale, A_zero_point, + B_data, B_scale, B_zero_point, Bias, Y_data, + false, true, true); + + test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); + + std::vector> execution_providers; + execution_providers.push_back(DefaultDmlExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_S8U8) { + OpTester test("MatMulIntegerToFloat", 1, kMSDomain); + int64_t M = 5; + int64_t N = 5; + int64_t K = 2; + + std::vector A_data = {3, 7, -2, 1, 1, + 2, -1, -9, 1, 1}; + std::vector B_data = {3, 7, 2, 1, 1, + 2, 1, 9, 1, 1}; + std::vector A_scale = ToFloat16({-4.0f}); + std::vector B_scale = ToFloat16({2.0f}); + test.AddInput("A", {M, K}, A_data); + test.AddInput("B", {K, N}, B_data); + std::vector A_zero_point = {-1}; + std::vector B_zero_point = {1}; + std::vector Bias = ToFloat16({11.0f, -17.0f, 1.0f, -3.0f, 12.0f}); + + test.AddInput("a_scale", {1}, A_scale); + test.AddInput("b_scale", {1}, B_scale); + test.AddInput("a_zero_point", {1}, A_zero_point); + test.AddInput("b_zero_point", {1}, B_zero_point); + test.AddInput("bias", {N}, Bias); + + std::vector Y_data(M * N); + CalculateMatMulIntegerToFloat(M, N, K, A_data, A_scale, A_zero_point, + 
B_data, B_scale, B_zero_point, Bias, Y_data, + false, true, true); + + test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); + + std::vector> execution_providers; + execution_providers.push_back(DefaultDmlExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16) { + OpTester test("MatMulIntegerToFloat", 1, kMSDomain); + int64_t M = 2; + int64_t N = 2; + int64_t K = 3; + + std::vector A_data = {11, -2, 5, + -1, 3, 10}; + std::vector B_data = {-13, -2, + 9, 55, + -1, 23}; + std::vector A_scale = ToFloat16({0.910f}); + std::vector B_scale = ToFloat16({1.10f, 1.123f}); + + std::vector A_zero_point = {113}; + std::vector B_zero_point = {98, 71}; + + std::vector Bias = ToFloat16({0.10f, 1.123f}); + + test.AddInput("A", {M, K}, A_data); + test.AddInput("B", {K, N}, B_data); + + test.AddInput("a_scale", {}, {A_scale}); + test.AddInput("b_scale", {N}, B_scale); + test.AddInput("a_zero_point", {}, {A_zero_point}); + test.AddInput("b_zero_point", {N}, B_zero_point); + test.AddInput("bias", {N}, Bias); + + std::vector Y_data(M * N); + CalculateMatMulIntegerToFloat(M, N, K, A_data, A_scale, A_zero_point, + B_data, B_scale, B_zero_point, Bias, Y_data, + true, true, true); + + test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); + test.SetOutputRelErr("Y", 2e-2f); + std::vector> execution_providers; + execution_providers.push_back(DefaultDmlExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } +#endif TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint) { auto test_case = [&](const std::vector& input_shape, diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index 16f38bac62713..1535e2b60a3bd 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -5679,6 +5679,24 @@ TEST_F(GraphTransformationTests, MatMulIntegerToFloatTest) { EXPECT_EQ(op_to_count["Add"], 1); } +#ifdef USE_DML +TEST_F(GraphTransformationTests, MatMulIntegerToFloat16Test) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/matmul_integer_to_float16_int8.onnx"; + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + for (auto& node : graph.Nodes()) { + node.SetExecutionProviderType(kDmlExecutionProvider); + } + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::make_unique(), TransformerLevel::Level2)); + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_)); + std::map op_to_count = CountOpsInGraph(graph); + EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1); +} +#endif // USE_DML + #endif #ifndef DISABLE_CONTRIB_OPS diff --git a/onnxruntime/test/testdata/matmul_integer_to_float.py b/onnxruntime/test/testdata/matmul_integer_to_float.py index b898390044cf4..e6c51009018f9 100644 --- a/onnxruntime/test/testdata/matmul_integer_to_float.py +++ b/onnxruntime/test/testdata/matmul_integer_to_float.py @@ -4,7 +4,7 @@ from onnx import TensorProto, helper -def GenerateModel(model_name, sign_i, sign_w, has_zp=True, bias=False): # noqa: N802 +def GenerateModel(model_name, sign_i, sign_w, output_type_fp16, has_zp=True, bias=False): # noqa: N802 nodes = [ # subgraph helper.make_node( "MatMulInteger", @@ -13,7 +13,13 @@ def 
GenerateModel(model_name, sign_i, sign_w, has_zp=True, bias=False): # noqa: "MatMulInteger", ), helper.make_node("Mul", ["a_scale", "b_scale"], ["multiplier"], "mul_right"), - helper.make_node("Cast", ["matmul_output_int32"], ["matmul_output_float"], "cast", to=1), + helper.make_node( + "Cast", + ["matmul_output_int32"], + ["matmul_output_float"], + "cast", + to=TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, + ), helper.make_node( "Mul", ["matmul_output_float", "multiplier"], @@ -25,8 +31,8 @@ def GenerateModel(model_name, sign_i, sign_w, has_zp=True, bias=False): # noqa: inputs = [ # inputs helper.make_tensor_value_info("A", TensorProto.INT8 if sign_i else TensorProto.UINT8, ["M", "K"]), helper.make_tensor_value_info("B", TensorProto.INT8 if sign_w else TensorProto.UINT8, ["K", "N"]), - helper.make_tensor_value_info("a_scale", TensorProto.FLOAT, [1]), - helper.make_tensor_value_info("b_scale", TensorProto.FLOAT, ["C"]), + helper.make_tensor_value_info("a_scale", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [1]), + helper.make_tensor_value_info("b_scale", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, ["C"]), ] if has_zp: @@ -48,14 +54,22 @@ def GenerateModel(model_name, sign_i, sign_w, has_zp=True, bias=False): # noqa: if bias: nodes.extend([helper.make_node("Add", ["mul_bottom_output", "bias"], ["Y"], "add")]) - inputs.extend([helper.make_tensor_value_info("bias", TensorProto.FLOAT, ["N"])]) + inputs.extend( + [ + helper.make_tensor_value_info( + "bias", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, ["N"] + ) + ] + ) graph = helper.make_graph( nodes, "DynamicQuantizeMatMul_fusion", # name inputs, [ # outputs - helper.make_tensor_value_info("Y", TensorProto.FLOAT, ["M", "N"]), + helper.make_tensor_value_info( + "Y", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, ["M", "N"] + ), ], ) @@ -64,10 +78,32 @@ def GenerateModel(model_name, sign_i, sign_w, has_zp=True, bias=False): # noqa: if __name__ == "__main__": - GenerateModel("matmul_integer_to_float_int8.onnx", False, True) - GenerateModel("matmul_integer_to_float_uint8.onnx", False, False) - GenerateModel("matmul_integer_to_float_int8_bias.onnx", False, True, False, True) - GenerateModel("matmul_integer_to_float_uint8_bias.onnx", False, False, False, True) + GenerateModel("matmul_integer_to_float16_int8.onnx", sign_i=False, sign_w=True, output_type_fp16=True) + GenerateModel("matmul_integer_to_float_int8.onnx", sign_i=False, sign_w=True, output_type_fp16=False) + GenerateModel("matmul_integer_to_float_uint8.onnx", sign_i=False, sign_w=False, output_type_fp16=False) + GenerateModel( + "matmul_integer_to_float_int8_bias.onnx", + sign_i=False, + sign_w=True, + output_type_fp16=False, + has_zp=False, + bias=True, + ) + GenerateModel( + "matmul_integer_to_float_uint8_bias.onnx", + sign_i=False, + sign_w=False, + output_type_fp16=False, + has_zp=False, + bias=True, + ) - GenerateModel("matmul_integer_to_float_int8_int8.onnx", True, True) - GenerateModel("matmul_integer_to_float_int8_int8_bias.onnx", True, True, False, True) + GenerateModel("matmul_integer_to_float_int8_int8.onnx", sign_i=True, sign_w=True, output_type_fp16=False) + GenerateModel( + "matmul_integer_to_float_int8_int8_bias.onnx", + sign_i=True, + sign_w=True, + output_type_fp16=False, + has_zp=False, + bias=True, + ) diff --git a/onnxruntime/test/testdata/matmul_integer_to_float_int8.onnx b/onnxruntime/test/testdata/matmul_integer_to_float_int8.onnx index 9f4465a914963..906dec542a4fa 100644 --- 
a/onnxruntime/test/testdata/matmul_integer_to_float_int8.onnx +++ b/onnxruntime/test/testdata/matmul_integer_to_float_int8.onnx @@ -1,4 +1,4 @@ -:Ì + :Ì U A B @@ -44,4 +44,4 @@ mul_bottom"MulDynamicQuantizeMatMul_fusionZ  M -NB \ No newline at end of file +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/matmul_integer_to_float_int8_bias.onnx b/onnxruntime/test/testdata/matmul_integer_to_float_int8_bias.onnx index 01b7e15aa4a1f..16cdf03c7ae59 100644 --- a/onnxruntime/test/testdata/matmul_integer_to_float_int8_bias.onnx +++ b/onnxruntime/test/testdata/matmul_integer_to_float_int8_bias.onnx @@ -1,4 +1,4 @@ -:Ä + :Ä 9 A Bmatmul_output_int32 MatMulInteger" MatMulInteger @@ -41,4 +41,4 @@ mul_bottom"Mul  M -NB \ No newline at end of file +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/matmul_integer_to_float_int8_int8.onnx b/onnxruntime/test/testdata/matmul_integer_to_float_int8_int8.onnx index 9d38828e25d6a..55102757a0b57 100644 --- a/onnxruntime/test/testdata/matmul_integer_to_float_int8_int8.onnx +++ b/onnxruntime/test/testdata/matmul_integer_to_float_int8_int8.onnx @@ -1,4 +1,4 @@ -:Ì + :Ì U A B @@ -44,4 +44,4 @@ mul_bottom"MulDynamicQuantizeMatMul_fusionZ  M -NB \ No newline at end of file +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/matmul_integer_to_float_int8_int8_bias.onnx b/onnxruntime/test/testdata/matmul_integer_to_float_int8_int8_bias.onnx index 4d9a55af50a87..d9d7222a1acaa 100644 --- a/onnxruntime/test/testdata/matmul_integer_to_float_int8_int8_bias.onnx +++ b/onnxruntime/test/testdata/matmul_integer_to_float_int8_int8_bias.onnx @@ -1,4 +1,4 @@ -:Ä + :Ä 9 A Bmatmul_output_int32 MatMulInteger" MatMulInteger @@ -41,4 +41,4 @@ mul_bottom"Mul  M -NB \ No newline at end of file +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/matmul_integer_to_float_uint8.onnx b/onnxruntime/test/testdata/matmul_integer_to_float_uint8.onnx index a4c6d20d59be8..5373ce145688e 100644 --- a/onnxruntime/test/testdata/matmul_integer_to_float_uint8.onnx +++ b/onnxruntime/test/testdata/matmul_integer_to_float_uint8.onnx @@ -1,4 +1,4 @@ -:Ì + :Ì U A B @@ -44,4 +44,4 @@ mul_bottom"MulDynamicQuantizeMatMul_fusionZ  M -NB \ No newline at end of file +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/matmul_integer_to_float_uint8_bias.onnx b/onnxruntime/test/testdata/matmul_integer_to_float_uint8_bias.onnx index a5be0c63f4dcb..e407414b23b24 100644 --- a/onnxruntime/test/testdata/matmul_integer_to_float_uint8_bias.onnx +++ b/onnxruntime/test/testdata/matmul_integer_to_float_uint8_bias.onnx @@ -1,4 +1,4 @@ -:Ä + :Ä 9 A Bmatmul_output_int32 MatMulInteger" MatMulInteger @@ -41,4 +41,4 @@ mul_bottom"Mul  M -NB \ No newline at end of file +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.onnx b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.onnx index 7ea69c580ee435be09f12b949f14fdb2efe3d403..aa8e67bcbc59e53d3418000c23ef35c75dfd76c6 100644 GIT binary patch delta 13 Ucmeys{ehc_gL5O(TUJJ403a9x!vFvP delta 13 Ucmeys{ehc_gMA~@TUJIM03ZVcx&QzG diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx new file mode 100644 index 0000000000000..22293b0d10756 --- /dev/null +++ b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx @@ -0,0 +1,51 @@ + :Ì +U +A +B + a_zero_point + 
b_zero_pointmatmul_output_int32 MatMulInteger" MatMulInteger +. +a_scale +b_scale +multiplier mul_right"Mul +A +matmul_output_int32matmul_output_floatcast"Cast* +to +  +5 +matmul_output_float + +multiplierY +mul_bottom"MulDynamicQuantizeMatMul_fusionZ +A + + +M +KZ +B + + +K +NZ +a_scale + + + +Z +b_scale +  + +CZ + a_zero_point + + +Z + b_zero_point +  +Cb +Y + + + +M +NB \ No newline at end of file From 0cdf36faeb4eafcf543bd84dd6f543a55df738c1 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 4 Mar 2024 13:46:51 -0800 Subject: [PATCH 104/279] Expose SessionOtions.DisablePerSessionThreads (#19730) ### Description ### Motivation and Context ML.NET needs to run mltiple sessions on a single threadpool. --- .../src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs | 5 +++++ .../Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs | 9 +++++++++ .../InferenceTest.cs | 5 ++++- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs index 4128524b30483..8a8426a0b3054 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs @@ -362,6 +362,7 @@ static NativeMethods() OrtDisableMemPattern = (DOrtDisableMemPattern)Marshal.GetDelegateForFunctionPointer(api_.DisableMemPattern, typeof(DOrtDisableMemPattern)); OrtEnableCpuMemArena = (DOrtEnableCpuMemArena)Marshal.GetDelegateForFunctionPointer(api_.EnableCpuMemArena, typeof(DOrtEnableCpuMemArena)); OrtDisableCpuMemArena = (DOrtDisableCpuMemArena)Marshal.GetDelegateForFunctionPointer(api_.DisableCpuMemArena, typeof(DOrtDisableCpuMemArena)); + OrtDisablePerSessionThreads = (DOrtDisablePerSessionThreads)Marshal.GetDelegateForFunctionPointer(api_.DisablePerSessionThreads, typeof(DOrtDisablePerSessionThreads)); OrtSetSessionLogId = (DOrtSetSessionLogId)Marshal.GetDelegateForFunctionPointer(api_.SetSessionLogId, typeof(DOrtSetSessionLogId)); OrtSetSessionLogVerbosityLevel = (DOrtSetSessionLogVerbosityLevel)Marshal.GetDelegateForFunctionPointer(api_.SetSessionLogVerbosityLevel, typeof(DOrtSetSessionLogVerbosityLevel)); OrtSetSessionLogSeverityLevel = (DOrtSetSessionLogSeverityLevel)Marshal.GetDelegateForFunctionPointer(api_.SetSessionLogSeverityLevel, typeof(DOrtSetSessionLogSeverityLevel)); @@ -992,6 +993,10 @@ IntPtr[] outputValues /* An array of output value pointers. 
Array must be alloca public delegate IntPtr /*(OrtStatus*)*/ DOrtDisableCpuMemArena(IntPtr /* OrtSessionOptions* */ options); public static DOrtDisableCpuMemArena OrtDisableCpuMemArena; + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /*(OrtStatus*)*/ DOrtDisablePerSessionThreads(IntPtr /* OrtSessionOptions* */ options); + public static DOrtDisablePerSessionThreads OrtDisablePerSessionThreads; + [UnmanagedFunctionPointer(CallingConvention.Winapi)] public delegate IntPtr /*(OrtStatus*)*/ DOrtSetSessionLogId(IntPtr /* OrtSessionOptions* */ options, byte[] /* const char* */ logId); public static DOrtSetSessionLogId OrtSetSessionLogId; diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs index 7a68246c9b67a..30d005b3c4236 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs @@ -696,6 +696,15 @@ public bool EnableCpuMemArena } private bool _enableCpuMemArena = true; + /// + /// Disables the per session threads. Default is true. + /// This makes all sessions in the process use a global TP. + /// + public void DisablePerSessionThreads() + { + NativeApiStatus.VerifySuccess(NativeMethods.OrtDisablePerSessionThreads(handle)); + } + /// /// Log Id to be used for the session. Default is empty string. /// diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs index fd8feda359f90..d6a6b9627f418 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs @@ -55,6 +55,9 @@ public void TestSessionOptions() Assert.Equal(0, opt.InterOpNumThreads); Assert.Equal(GraphOptimizationLevel.ORT_ENABLE_ALL, opt.GraphOptimizationLevel); + // No get, so no verify + opt.DisablePerSessionThreads(); + // try setting options opt.ExecutionMode = ExecutionMode.ORT_PARALLEL; Assert.Equal(ExecutionMode.ORT_PARALLEL, opt.ExecutionMode); @@ -98,7 +101,7 @@ public void TestSessionOptions() Assert.Contains("[ErrorCode:InvalidArgument] Config key is empty", ex.Message); // SessionOptions.RegisterOrtExtensions can be manually tested by referencing the - // Microsoft.ML.OnnxRuntime.Extensions nuget package. After that is done, this should not throw. + // Microsoft.ML.OnnxRuntime.Extensions nuget package. After that is done, this should not throw. 
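// Until the Extensions package is referenced, the call below is expected to throw; the assertion checks for the guidance message.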
ex = Assert.Throws(() => { opt.RegisterOrtExtensions(); }); Assert.Contains("Microsoft.ML.OnnxRuntime.Extensions NuGet package must be referenced", ex.Message); From 2a5c9b86ebbdba8fb76f79de26524a2fdd2e5c2a Mon Sep 17 00:00:00 2001 From: zhijiang <43435212+zhijxu-MS@users.noreply.github.com> Date: Tue, 5 Mar 2024 10:11:19 +0800 Subject: [PATCH 105/279] Zhijxu/fix conv1d replacement (#19758) remove the constraint - "group number should be less than 3"; add more condition to make sure the conv1d replacement only happens on conv1d instead of conv2d/conv3d; add more tests; --- .../core/optimizer/conv1d_replacement.cc | 63 +++++++++++------- .../test/optimizer/graph_transform_test.cc | 64 ++++++++++++++++--- 2 files changed, 96 insertions(+), 31 deletions(-) diff --git a/orttraining/orttraining/core/optimizer/conv1d_replacement.cc b/orttraining/orttraining/core/optimizer/conv1d_replacement.cc index 0412000e04e1b..ff220fcb067b8 100644 --- a/orttraining/orttraining/core/optimizer/conv1d_replacement.cc +++ b/orttraining/orttraining/core/optimizer/conv1d_replacement.cc @@ -42,30 +42,45 @@ */ namespace onnxruntime { bool NodeCanBeReplacedByMatmul(const Node& node) { - // If node type is Conv, and attr "dilations" is 1, "kernel_shape" is 1, "stride" is 1, group is 1 or 2, - // then it can be replaced by MatMul - // Kernel_shape is 1 means it is conv1d + /* + If node type is Conv, and satisfy the following conditions then it can be replaced by MatMul: + - not bias as input which means only has 2 inputs: input and weight + - "dilations" should be [1] + size 1 means conv1d + - "strides" should be [1] + - "pads" should be [0,0] + - "autopad" should be "NOTSET" + - "kernel_shape" should be [1] + */ if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Conv", {1, 11})) { return false; } - const auto* dilations = graph_utils::GetNodeAttribute(node, "dilations"); - const auto* kernel_shape = graph_utils::GetNodeAttribute(node, "kernel_shape"); - const auto* stride = graph_utils::GetNodeAttribute(node, "strides"); - const auto* group = graph_utils::GetNodeAttribute(node, "group"); - if (dilations == nullptr || kernel_shape == nullptr || stride == nullptr || group == nullptr) { + + // TODO: bias input can also be supported if needed + if (node.InputDefs().size() != 2) { return false; } - if ((dilations->ints_size() && dilations->ints(0) != 1) || - (kernel_shape->ints_size() && kernel_shape->ints(0) != 1) || - (stride->ints_size() && stride->ints(0) != 1) || - group->i() >= 3) { + + const auto* dilations = graph_utils::GetNodeAttribute(node, "dilations"); + const auto* strides = graph_utils::GetNodeAttribute(node, "strides"); + const auto* pads = graph_utils::GetNodeAttribute(node, "pads"); + const auto* autopad = graph_utils::GetNodeAttribute(node, "auto_pad"); + const auto* kernel_shape = graph_utils::GetNodeAttribute(node, "kernel_shape"); + if (dilations == nullptr || strides == nullptr || pads == nullptr || autopad == nullptr || kernel_shape == nullptr) { return false; } - return true; + if ((dilations->ints_size() == 1 && dilations->ints(0) == 1) && + (strides->ints_size() == 1 && strides->ints(0) == 1) && + (autopad->s() == "NOTSET") && + (pads->ints_size() == 2 && pads->ints(0) == 0 && pads->ints(1) == 0) && + (kernel_shape->ints_size() == 1 && kernel_shape->ints(0) == 1)) { + return true; + } + return false; } -void Conv1dToMatmul(Graph& graph, Node& conv) { +void Conv1dToMatmul(Graph& graph, Node& conv, const std::string transformer_name) { // Shape of conv1d input: [batch_size, in_channels, 
in_length] // Shape of conv1d weight:[output_channels, input_channels/group, kernel_shape], kernel_shape is 1 // We need to split the input into "group", and squeeze&split the weight, and then do MatMul @@ -83,7 +98,7 @@ void Conv1dToMatmul(Graph& graph, Node& conv) { conv1d_input_splitted_outputs.push_back(&graph.GetOrCreateNodeArg( graph.GenerateNodeArgName("input_split_output"), nullptr)); } - auto& input_split = graph.AddNode(graph.GenerateNodeName("Split"), "Split", node_description, {conv1d_input}, + auto& input_split = graph.AddNode(graph.GenerateNodeName(transformer_name + "Split"), "Split", node_description, {conv1d_input}, {conv1d_input_splitted_outputs}); input_split.SetExecutionProviderType(execution_provider_type); input_split.AddAttribute("axis", int64_t(1)); @@ -93,23 +108,25 @@ void Conv1dToMatmul(Graph& graph, Node& conv) { } // 2. Squeeze conv weight auto conv1d_weight = conv.MutableInputDefs()[1]; + // auto con1d_bias = xx; auto weight_squeeze_output = &graph.GetOrCreateNodeArg(graph.GenerateNodeArgName("weight_squeeze_output"), nullptr); - auto& weight_squeeze = graph.AddNode(graph.GenerateNodeName("WeightSqueeze"), "Squeeze", + auto& weight_squeeze = graph.AddNode(graph.GenerateNodeName(transformer_name + "WeightSqueeze"), "Squeeze", node_description, {conv1d_weight}, {weight_squeeze_output}); + int64_t weight_squeeze_axis = 2; if (onnx_opset_version > 12) { // After onnx version 12, squeeze node has axes as input instead of attribute ONNX_NAMESPACE::TensorProto initializer_proto; - initializer_proto.set_name(graph.GenerateNodeName("ConstAsInitializer")); + initializer_proto.set_name(graph.GenerateNodeName(transformer_name + "ConstAsInitializer")); initializer_proto.add_dims(static_cast(1)); initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); - InlinedVector initializer_proto_value{2}; + InlinedVector initializer_proto_value{weight_squeeze_axis}; initializer_proto.set_raw_data(initializer_proto_value.data(), initializer_proto_value.size() * sizeof(int64_t)); auto& axes_input = graph_utils::AddInitializer(graph, initializer_proto); // Squeeze node doesn't have opschema here, so we need to set input args count manually weight_squeeze.MutableInputArgsCount().resize(2); graph_utils::AddNodeInput(weight_squeeze, 1, axes_input); } else { - weight_squeeze.AddAttribute("axes", std::vector{2}); + weight_squeeze.AddAttribute("axes", std::vector{weight_squeeze_axis}); } weight_squeeze.SetExecutionProviderType(execution_provider_type); // 3. 
Split conv weight @@ -118,7 +135,7 @@ void Conv1dToMatmul(Graph& graph, Node& conv) { conv1d_weight_splitted_outputs.push_back(&graph.GetOrCreateNodeArg( graph.GenerateNodeArgName("weight_split_output"), nullptr)); } - auto& weight_split = graph.AddNode(graph.GenerateNodeName("Split"), "Split", node_description, + auto& weight_split = graph.AddNode(graph.GenerateNodeName(transformer_name + "Split"), "Split", node_description, {weight_squeeze_output}, {conv1d_weight_splitted_outputs}); weight_split.AddAttribute("axis", int64_t(0)); weight_split.SetExecutionProviderType(execution_provider_type); @@ -130,13 +147,13 @@ void Conv1dToMatmul(Graph& graph, Node& conv) { for (int i = 0; i < group_num; i++) { auto matmul_output = &graph.GetOrCreateNodeArg(graph.GenerateNodeArgName("matmul_output"), nullptr); matmul_outputs.push_back(matmul_output); - auto& matmul = graph.AddNode(graph.GenerateNodeName("Matmul"), "MatMul", node_description, + auto& matmul = graph.AddNode(graph.GenerateNodeName(transformer_name + "Matmul"), "MatMul", node_description, {conv1d_weight_splitted_outputs[i], conv1d_input_splitted_outputs[i]}, {matmul_output}); matmul.SetExecutionProviderType(execution_provider_type); } // 5. Concat matmul outputs - auto& concat_node = graph.AddNode(graph.GenerateNodeName("Concat"), "Concat", node_description, + auto& concat_node = graph.AddNode(graph.GenerateNodeName(transformer_name + "Concat"), "Concat", node_description, matmul_outputs, {}); concat_node.SetExecutionProviderType(execution_provider_type); concat_node.AddAttribute("axis", int64_t(1)); @@ -155,7 +172,7 @@ Status Conv1dReplacement::ApplyImpl(Graph& graph, bool& modified, int graph_leve ORT_RETURN_IF_ERROR(Recurse(node, modified, graph_level, logger)); if (NodeCanBeReplacedByMatmul(node)) { LOGS(logger, VERBOSE) << "lora conv1d replacement, node name: " + node.Name(); - Conv1dToMatmul(graph, node); + Conv1dToMatmul(graph, node, Name()); modified = true; } } diff --git a/orttraining/orttraining/test/optimizer/graph_transform_test.cc b/orttraining/orttraining/test/optimizer/graph_transform_test.cc index bab7c09839273..109937ff96d1d 100644 --- a/orttraining/orttraining/test/optimizer/graph_transform_test.cc +++ b/orttraining/orttraining/test/optimizer/graph_transform_test.cc @@ -1200,7 +1200,7 @@ TEST_P(QDQFusionTestsParameterized, CheckModelComposition) { ASSERT_EQ(op_to_count_post_fusion["com.microsoft.FakeQuant"], 1); } -TEST_F(GraphTransformationTests, Conv1dReplacement) { +TEST_F(GraphTransformationTests, Conv1dReplacement_TakeEffect) { auto pre_graph_checker = [&](Graph& graph) { auto op_count_map = CountOpsInGraph(graph); TEST_RETURN_IF_NOT(op_count_map["Conv"] == 1); @@ -1208,7 +1208,7 @@ TEST_F(GraphTransformationTests, Conv1dReplacement) { }; for (auto opset : {11, 12, 13, 14, 15, 16, 17, 18}) { - for (auto group : {1, 2}) { + for (auto group : {1, 2, 4}) { auto build_test_case = [&](ModelTestBuilder& builder) { auto [batch_size, in_channel, in_length] = std::make_tuple(8, 16, 128); auto out_channel = 64; @@ -1222,6 +1222,8 @@ TEST_F(GraphTransformationTests, Conv1dReplacement) { conv_node.AddAttribute("kernel_shape", std::vector{1}); conv_node.AddAttribute("strides", std::vector{1}); conv_node.AddAttribute("group", static_cast(group)); + conv_node.AddAttribute("pads", std::vector{0, 0}); + conv_node.AddAttribute("auto_pad", "NOTSET"); }; auto post_graph_checker = [&](Graph& graph) { @@ -1243,28 +1245,64 @@ TEST_F(GraphTransformationTests, Conv1dReplacement) { } } -TEST_F(GraphTransformationTests, 
Conv1dReplacement_NoTakeEffect) { +// node has bias input so conv not replaced +TEST_F(GraphTransformationTests, Conv1dReplacement_NoTakeEffect1) { auto pre_graph_checker = [&](Graph& graph) { auto op_count_map = CountOpsInGraph(graph); TEST_RETURN_IF_NOT(op_count_map["Conv"] == 1); return Status::OK(); }; - // "group" is 3 so conv not replaced for (auto opset : {11, 12, 13, 14, 15, 16, 17, 18}) { auto build_test_case = [&](ModelTestBuilder& builder) { auto [batch_size, in_channel, in_length] = std::make_tuple(8, 16, 128); auto out_channel = 64; auto* data_arg = builder.MakeInput({{batch_size, in_channel, in_length}}); - auto* weight_arg = builder.MakeInitializer({out_channel, in_channel / 3, 1}, {-1.0f, 1.0f}); + auto* weight_arg = builder.MakeInitializer({out_channel, in_channel, 1}, {-1.0f, 1.0f}); + auto* bias_arg = builder.MakeInitializer({out_channel}, {-1.0f, 1.0f}); + auto* conv_output = builder.MakeOutput(); + + auto& conv_node = builder.AddNode("Conv", {data_arg, weight_arg, bias_arg}, {conv_output}); + conv_node.AddAttribute("dilations", std::vector{1}); + conv_node.AddAttribute("kernel_shape", std::vector{1}); + conv_node.AddAttribute("strides", std::vector{1}); + conv_node.AddAttribute("group", static_cast(1)); + conv_node.AddAttribute("pads", std::vector{0, 0}); + conv_node.AddAttribute("auto_pad", "NOTSET"); + }; + + std::unique_ptr transformer = std::make_unique(); + ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, opset, *logger_, std::move(transformer), + TransformerLevel::Level1, 1, + pre_graph_checker, pre_graph_checker)); + } +} + +// "auto_pad " is not NOTSET so conv not replaced +TEST_F(GraphTransformationTests, Conv1dReplacement_NoTakeEffect2) { + auto pre_graph_checker = [&](Graph& graph) { + auto op_count_map = CountOpsInGraph(graph); + TEST_RETURN_IF_NOT(op_count_map["Conv"] == 1); + return Status::OK(); + }; + + for (auto opset : {11, 12, 13, 14, 15, 16, 17, 18}) { + auto build_test_case = [&](ModelTestBuilder& builder) { + auto [batch_size, in_channel, in_length] = std::make_tuple(8, 16, 128); + auto out_channel = 64; + auto* data_arg = builder.MakeInput({{batch_size, in_channel, in_length}}); + + auto* weight_arg = builder.MakeInitializer({out_channel, in_channel, 1}, {-1.0f, 1.0f}); auto* conv_output = builder.MakeOutput(); auto& conv_node = builder.AddNode("Conv", {data_arg, weight_arg}, {conv_output}); conv_node.AddAttribute("dilations", std::vector{1}); conv_node.AddAttribute("kernel_shape", std::vector{1}); conv_node.AddAttribute("strides", std::vector{1}); - conv_node.AddAttribute("group", static_cast(3)); + conv_node.AddAttribute("group", static_cast(1)); + conv_node.AddAttribute("pads", std::vector{0, 0}); + conv_node.AddAttribute("auto_pad", "VALID"); }; std::unique_ptr transformer = std::make_unique(); @@ -1272,8 +1310,16 @@ TEST_F(GraphTransformationTests, Conv1dReplacement_NoTakeEffect) { TransformerLevel::Level1, 1, pre_graph_checker, pre_graph_checker)); } +} + +// pads is not all zero, so conv not replaced +TEST_F(GraphTransformationTests, Conv1dReplacement_NoTakeEffect3) { + auto pre_graph_checker = [&](Graph& graph) { + auto op_count_map = CountOpsInGraph(graph); + TEST_RETURN_IF_NOT(op_count_map["Conv"] == 1); + return Status::OK(); + }; - // "kernel_shape" is not 1 so conv not replaced for (auto opset : {11, 12, 13, 14, 15, 16, 17, 18}) { auto build_test_case = [&](ModelTestBuilder& builder) { auto [batch_size, in_channel, in_length] = std::make_tuple(8, 16, 128); @@ -1285,9 +1331,11 @@ TEST_F(GraphTransformationTests, 
Conv1dReplacement_NoTakeEffect) { auto& conv_node = builder.AddNode("Conv", {data_arg, weight_arg}, {conv_output}); conv_node.AddAttribute("dilations", std::vector{1}); - conv_node.AddAttribute("kernel_shape", std::vector{2}); + conv_node.AddAttribute("kernel_shape", std::vector{1}); conv_node.AddAttribute("strides", std::vector{1}); conv_node.AddAttribute("group", static_cast(1)); + conv_node.AddAttribute("pads", std::vector{1, 0}); + conv_node.AddAttribute("auto_pad", "NOTSET"); }; std::unique_ptr transformer = std::make_unique(); From 7e613ee821405b1192d0b71b9434a4f94643f1e4 Mon Sep 17 00:00:00 2001 From: wejoncy Date: Tue, 5 Mar 2024 11:45:45 +0800 Subject: [PATCH 106/279] [quant] supports act_order inputs in Matmulnbits and new quantization algorithm "hqq" (#19106) ### Description 1. Support quantized GPTQ weight in huggingface like [TheBloke/Llama-2-7B-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ) 2. Support Act_order for GPTQ 3. Support [HQQ](https://mobiusml.github.io/hqq_blog/) algorithm to quantize matmul weight and add quant script ### Motivation and Context --- docs/ContribOperators.md | 43 +- docs/OperatorKernels.md | 4 +- .../cpu/quantization/matmul_nbits.cc | 105 ++++- .../cpu/quantization/matmul_nbits_impl.cc | 108 +++++ .../cpu/quantization/matmul_nbits_impl.h | 23 ++ .../cuda/quantization/dequantize_blockwise.cu | 159 ++++++-- .../quantization/dequantize_blockwise.cuh | 6 +- .../cuda/quantization/matmul_nbits.cc | 170 ++++---- .../cuda/quantization/matmul_nbits.h | 41 ++ .../core/graph/contrib_ops/contrib_defs.cc | 38 +- .../quantization/matmul_4bits_quantizer.py | 379 ++++++++++++++++-- .../test/contrib_ops/matmul_4bits_test.cc | 78 +++- .../test/python/quantization/op_test_utils.py | 3 +- .../quantization/test_op_matmul_4bits.py | 19 +- 14 files changed, 942 insertions(+), 234 deletions(-) create mode 100644 onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc create mode 100644 onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.h create mode 100644 onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.h diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index e295dfa203ae5..5f0100fad95a2 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -2808,22 +2808,23 @@ This version of the operator has been available since version 1 of the 'com.micr And block_size is not an arbitrary number and must be a power of 2 and not smaller than 16, like 16, 32, 64, 128,.. 3. Input B's scale and zero point are specified by input scales and zero_points. - Input B is stored as uint8_t with shape: [N][n_blocks_per_col][blob_size] in which: - - n_blocks_per_col = (K + block_size - 1) / block_size - - blob_size = block_size / 8 * bits + Input is stored as uint8_t with shape: [N][n_blocks_per_col][blob_size] in which: + - n_blocks_per_col = (K + block_size - 1) / block_size + - blob_size = CeilDiv(block_size * bits, bitsof(uint8_t)<8>) + For all bits from 2-8, a row of data is stored squeezely and represented by uint8_t. + - for 2,4,8 bits, 4x2bit,2x4bit,1x8bit are stored in one uint8_t. + 4bit example: + |.|.|.|.| .|.|.|.| =uint8_t (2x4bit) + - for 3,5,6,7 bits, 32x3bit,32x5bit,16x6bit,32x7bit are stored in 12xuint8_t,20xuint8_t,12xuint8_t,28xuint8_t separately. no bits are wasted. + 3bit example: + |.|.|. |.|.|. |.|.|. = 9bit, which across 2 uint8_t, the highest bit for the second uint8_t is used. + The last uint_8 may have some bits unused. - For a block blob. 
It is stored in format: - struct Blob { - uint8 one_bits[(bits & 0x1) * 1 * block_size / 8]; // highest 1 bit for 3, 5, 7 bits quantization - uint8 two_bits[(bits & 0x2) * 2 * block_size / 8]; // high 2 bits for 2, 6, 7 bits quantization - uint8 four_bits[(bits & 0x4) * 4 * block_size / 8]; // low 4 bits for 4, 5, 6 bits quantization - } Input scales is stored in same type as original type of B(float32, float16) with shape like: [N * n_blocks_per_col] - Input zero_points is stored as uint8_t. If bits <= 4, two zero points are stored as one unit8_t. If bits > 4, one zero point is stored with one unit8_t. Thus, its shape is: - - [(N * n_blocks_per_col + 1) / 2] if bits <=4 - - [N * n_blocks_per_col] if bits > 4 - + Input zero_points is stored as uint8_t or same as type(A). It has the same packing method as input B. + - [CeilDiv((N * n_blocks_per_col + 1) *bits, 8)] + If zero_points has same type as A, it's not packed and has the same shape as Scales. #### Version @@ -2844,17 +2845,19 @@ This version of the operator has been available since version 1 of the 'com.micr
number of groupsize used for weight quantization,(default 128). It needs to be a power of 2 and not smaller than 16.
-#### Inputs (3 - 4) +#### Inputs (3 - 5)
A : T1
The input tensor, not quantized
B : T2
-
1-dimensional data blob
+
1 or 2 dimensional data blob
scales : T1
quantization scale
-
zero_points (optional) : T2
+
zero_points (optional) : T3
quantization zero points
+
g_idx (optional) : T4
+
group_idx
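For illustration only, a minimal NumPy sketch of the 4-bit blockwise layout described above for input B; the sizes and variable names are hypothetical and not part of the operator spec:

```python
import numpy as np

# Hypothetical sizes, chosen only to illustrate the layout.
N, K, bits, block_size = 4, 96, 4, 32
n_blocks_per_col = (K + block_size - 1) // block_size   # CeilDiv(K, block_size) -> 3
blob_size = (block_size * bits + 7) // 8                # CeilDiv(block_size * bits, 8) -> 16

# 4-bit case: two quantized values are packed into one uint8, low nibble first.
q = np.random.randint(0, 16, size=(N, n_blocks_per_col, block_size), dtype=np.uint8)
packed_B = (q[..., 0::2] | (q[..., 1::2] << 4)).astype(np.uint8)
assert packed_B.shape == (N, n_blocks_per_col, blob_size)

# One scale per block, stored flat as [N * n_blocks_per_col].
scales = np.random.rand(N * n_blocks_per_col).astype(np.float32)
```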
#### Outputs @@ -2869,8 +2872,12 @@ This version of the operator has been available since version 1 of the 'com.micr
T1 : tensor(float), tensor(float16)
Constrain input and output types to float/half_float tensors.
-
T2 : tensor(uint8)
-
Constrain quantized weight types to uint8.
+
T2 : tensor(uint8), tensor(int32)
+
Constrain quantized weight types to uint8/int32.
+
T3 : tensor(uint8), tensor(int32), tensor(float16), tensor(float)
+
Constrain quantized zero point types to uint8/int32/float16/float.
+
T4 : tensor(int32)
+
the index tensor.
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 0e60b4622f2fb..71b0def659741 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -470,7 +470,7 @@ Do not modify directly.* |MatMulFpQ4|*in* A:**T1**
*in* B:**T2**
*in* B_shape:**T3**
*out* Y:**T1**|1+|**T1** = tensor(float)
**T2** = tensor(uint8)
**T3** = tensor(int64)| |MatMulInteger16|*in* A:**T1**
*in* B:**T2**
*out* Y:**T3**|1+|**T1** = tensor(int16)
**T2** = tensor(int16)
**T3** = tensor(int32)| |MatMulIntegerToFloat|*in* A:**T1**
*in* B:**T2**
*in* a_scale:**T3**
*in* b_scale:**T3**
*in* a_zero_point:**T1**
*in* b_zero_point:**T2**
*in* bias:**T3**
*out* Y:**T3**|1+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(float)| -|MatMulNBits|*in* A:**T1**
*in* B:**T2**
*in* scales:**T1**
*in* zero_points:**T2**
*out* Y:**T1**|1+|**T1** = tensor(float)
**T2** = tensor(uint8)| +|MatMulNBits|*in* A:**T1**
*in* B:**T2**
*in* scales:**T1**
*in* zero_points:**T3**
*in* g_idx:**T4**
*out* Y:**T1**|1+|**T1** = tensor(float)
**T2** = tensor(uint8)
**T3** = tensor(float), tensor(uint8)
**T4** = tensor(int32)| |MaxpoolWithMask|*in* X:**T**
*in* M:**tensor(int32)**
*out* Y:**T**|1+|**T** = tensor(float)| |MultiHeadAttention|*in* query:**T**
*in* key:**T**
*in* value:**T**
*in* bias:**T**
*in* key_padding_mask:**M**
*in* relative_position_bias:**T**
*in* past_key:**T**
*in* past_value:**T**
*out* output:**T**
*out* present_key:**T**
*out* present_value:**T**|1+|**T** = tensor(float)| |MurmurHash3|*in* X:**T1**
*out* Y:**T2**|1+|**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(string), tensor(uint32), tensor(uint64)
**T2** = tensor(int32), tensor(uint32)| @@ -855,7 +855,7 @@ Do not modify directly.* |Irfft|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)| |LongformerAttention|*in* input:**T**
*in* weight:**T**
*in* bias:**T**
*in* mask:**T**
*in* global_weight:**T**
*in* global_bias:**T**
*in* global:**G**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| |MatMulBnb4|*in* A:**T1**
*in* B:**T2**
*in* absmax:**T1**
*out* Y:**T1**|1+|**T1** = tensor(bfloat16), tensor(float), tensor(float16)
**T2** = tensor(uint8)| -|MatMulNBits|*in* A:**T1**
*in* B:**T2**
*in* scales:**T1**
*in* zero_points:**T2**
*out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(uint8)| +|MatMulNBits|*in* A:**T1**
*in* B:**T2**
*in* scales:**T1**
*in* zero_points:**T3**
*in* g_idx:**T4**
*out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(uint8)| |MoE|*in* input:**T**
*in* router_probs:**T**
*in* fc1_experts_weights:**T**
*in* fc2_experts_weights:**T**
*in* fc1_experts_bias:**T**
*in* fc2_experts_bias:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| |MultiHeadAttention|*in* query:**T**
*in* key:**T**
*in* value:**T**
*in* bias:**T**
*in* key_padding_mask:**M**
*in* relative_position_bias:**T**
*in* past_key:**T**
*in* past_value:**T**
*out* output:**T**
*out* present_key:**T**
*out* present_value:**T**|1+|**T** = tensor(float), tensor(float16)| |NGramRepeatBlock|*in* input_ids:**Tid**
*in* scores:**T**
*out* scores_out:**T**|1+|**T** = tensor(float)
**Tid** = tensor(int64)| diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc index 166f5c8f52f54..602dd98d8c0d6 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc @@ -1,6 +1,12 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "contrib_ops/cpu/quantization/matmul_nbits_impl.h" + +#include +#include + +#include "core/common/common.h" #include "core/common/narrow.h" #include "core/common/safeint.h" #include "core/framework/op_kernel.h" @@ -50,6 +56,17 @@ int64_t GetAccuracyLevel(size_t nbits, size_t block_size, int64_t accuracy_level } } // namespace +bool GetType(const NodeArg& node_arg, int32_t& type) { + type = ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED; + const auto* type_proto = node_arg.TypeAsProto(); + if (!type_proto || !type_proto->has_tensor_type() || !type_proto->tensor_type().has_elem_type()) { + return false; + } + + type = type_proto->tensor_type().elem_type(); + return true; +} + class MatMulNBits final : public OpKernel { public: MatMulNBits(const OpKernelInfo& info) @@ -59,6 +76,17 @@ class MatMulNBits final : public OpKernel { block_size_{narrow(info.GetAttr("block_size"))}, nbits_{narrow(info.GetAttr("bits"))}, accuracy_level_{GetAccuracyLevel(nbits_, block_size_, info.GetAttr("accuracy_level"))} { + const auto& node = info.node(); + auto input_defs = node.InputDefs(); + // g_idx + if (input_defs.size() > 4) { + act_order_ = true; + } + int32_t type; + if (input_defs.size() > 3 && GetType(*input_defs[3], type)) { + zero_point_is_not_quant_ = type != ONNX_NAMESPACE::TensorProto_DataType_UINT8; + } + ORT_ENFORCE(nbits_ == 4, "Only 4b quantization is supported for MatMulNBits op, additional bits support is planned."); #ifdef ORT_NEURAL_SPEED @@ -88,6 +116,8 @@ class MatMulNBits final : public OpKernel { const size_t N_; const size_t block_size_; const size_t nbits_; + bool act_order_{false}; + bool zero_point_is_not_quant_{false}; const int64_t accuracy_level_; const bool column_wise_quant_{true}; IAllocatorUniquePtr packed_b_; @@ -105,7 +135,9 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat /*out*/ bool& is_packed, /*out*/ PrePackedWeights* prepacked_weights) { is_packed = false; - + if (act_order_ || zero_point_is_not_quant_) { + return Status::OK(); + } #if defined(ORT_NEURAL_SPEED) if (!all_constant_) { @@ -212,7 +244,6 @@ Status MatMulNBits::UseSharedPrePackedBuffers(std::vector& prep Status MatMulNBits::Compute(OpKernelContext* ctx) const { concurrency::ThreadPool* thread_pool = ctx->GetOperatorThreadPool(); - const Tensor* a = ctx->Input(0); const auto* a_data = a->Data(); @@ -257,11 +288,14 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const { #endif // defined(ORT_NEURAL_SPEED) const Tensor* scales = ctx->Input(2); - const Tensor* zero_points = ctx->Input(3); + const Tensor* zero_points = ctx->InputCount() > 3 ? ctx->Input(3) : nullptr; + const Tensor* reorder_idx = ctx->InputCount() > 4 ? ctx->Input(4) : nullptr; + const auto* scales_data = scales->Data(); - const auto* zero_points_data = zero_points == nullptr ? nullptr : zero_points->Data(); + const auto* zero_points_data = zero_points == nullptr ? nullptr : zero_points->DataRaw(); TensorShape b_shape({static_cast(N_), static_cast(K_)}); + const auto* reorder_idx_data = reorder_idx == nullptr ? 
nullptr : reorder_idx->Data(); MatMulComputeHelper helper; ORT_RETURN_IF_ERROR(helper.Compute(a->Shape(), b_shape, false, true)); @@ -281,8 +315,9 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const { const size_t K = static_cast(helper.K()); const size_t lda = helper.Lda(false); - const bool has_single_b_matrix = std::all_of(helper.RightOffsets().begin(), helper.RightOffsets().end(), - [](size_t offset) { return offset == 0; }); + const bool has_single_b_matrix = + (!act_order_) && (!zero_point_is_not_quant_) && + std::all_of(helper.RightOffsets().begin(), helper.RightOffsets().end(), [](size_t offset) { return offset == 0; }); if (has_single_b_matrix) { const auto compute_type = static_cast(accuracy_level_); @@ -328,22 +363,50 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const { const uint8_t* b_data = b->Data(); const size_t ldb = helper.Ldb(true); - AllocatorPtr allocator; ORT_RETURN_IF_ERROR(ctx->GetTempSpaceAllocator(&allocator)); auto tmp_b_data_ptr = IAllocator::MakeUniquePtr(allocator, SafeInt(K_) * N_); - // dequantize b, only 4b quantization is supported for now - MlasDequantizeBlockwise( - tmp_b_data_ptr.get(), // dequantized output - b_data, // quantized input - scales_data, // quantization scales - zero_points_data, // quantization zero points - static_cast(block_size_), // quantization block size - column_wise_quant_, // columnwise quantization or row-wise - static_cast(K_), // number of rows in quantized input - static_cast(N_), // number of columns in quantized input - thread_pool); - + if ((reorder_idx_data == nullptr) && (!zero_points || !zero_points->IsDataType())) { + // dequantize b, only 4b quantization is supported for now + MlasDequantizeBlockwise( + tmp_b_data_ptr.get(), // dequantized output + b_data, // quantized input + scales_data, // quantization scales + static_cast(zero_points_data), // quantization zero points + static_cast(block_size_), // quantization block size + column_wise_quant_, // columnwise quantization or row-wise + static_cast(K_), // number of rows in quantized input + static_cast(N_), // number of columns in quantized input + thread_pool); + } else { + ORT_ENFORCE(column_wise_quant_, "Row-wise quantization is not supported for now"); + // !!!!!!!!!!!!!! naive implementation, need to be optimized !!!!!!!!!!!!!! 
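// Zero points may arrive either packed as uint8 or unpacked in the activation's float type (see the operator description earlier in this patch), so dispatch to the matching DequantizeBlockwise instantiation.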
+ if ((zero_points && zero_points->IsDataType())) { + DequantizeBlockwise( + tmp_b_data_ptr.get(), // dequantized output + b_data, // quantized input + scales_data, // quantization scales + static_cast(zero_points_data), // quantization zero points + reorder_idx_data, + static_cast(block_size_), // quantization block size + column_wise_quant_, // columnwise quantization or row-wise + static_cast(K_), // number of rows in quantized input + static_cast(N_), // number of columns in quantized input + thread_pool); + } else { + DequantizeBlockwise( + tmp_b_data_ptr.get(), // dequantized output + b_data, // quantized input + scales_data, // quantization scales + static_cast(zero_points_data), // quantization zero points + reorder_idx_data, + static_cast(block_size_), // quantization block size + column_wise_quant_, // columnwise quantization or row-wise + static_cast(K_), // number of rows in quantized input + static_cast(N_), // number of columns in quantized input + thread_pool); + } + } #if 0 // for debug auto tm_b_data_ptr_trans = IAllocator::MakeUniquePtr(allocator, SafeInt(K_) * N_); MlasTranspose(tmp_b_data_ptr.get(), tm_b_data_ptr_trans.get(), N_, K_); @@ -374,7 +437,9 @@ ONNX_OPERATOR_KERNEL_EX( kCpuExecutionProvider, KernelDefBuilder() .TypeConstraint("T1", DataTypeImpl::GetTensorType()) - .TypeConstraint("T2", DataTypeImpl::GetTensorType()), + .TypeConstraint("T2", DataTypeImpl::GetTensorType()) + .TypeConstraint("T3", {DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}) + .TypeConstraint("T4", DataTypeImpl::GetTensorType()), MatMulNBits); } // namespace contrib diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc new file mode 100644 index 0000000000000..f92e59e990ba5 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc @@ -0,0 +1,108 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +#include "contrib_ops/cpu/quantization/matmul_nbits_impl.h" + +#include +#include +#include +#include +#include + +#include "core/common/common.h" +#include "core/framework/float16.h" +#include "core/providers/common.h" +#include "core/platform/threadpool.h" + +namespace onnxruntime { +namespace contrib { + +template +void Dequantize4BitsKernelReOrder( + T* output, const uint8_t* quant_data, const T* scale_data, + const zeroT* zero_points, const int32_t* reorder_idx, int block_size, + int groups_per_threadblock, int total_groups, int out_rows, int out_cols, + int blockIdx_x, int threadIdx_x) { + const int group_id = blockIdx_x * groups_per_threadblock + ((threadIdx_x * 8) / block_size); + if (group_id >= total_groups) { + return; + } + const int scales_shape_x = (out_cols + block_size - 1) / block_size; + const int zero_point_shape_x = (scales_shape_x + 1) / 2; + + int n_idx = group_id / scales_shape_x; + int kb_idx = group_id % scales_shape_x; + int element_offset = group_id * block_size + ((threadIdx_x * 8) & (block_size - 1)); + + const int out_x = element_offset % (scales_shape_x * block_size); + const int out_y = element_offset / (scales_shape_x * block_size); + if (out_y >= out_rows || out_x >= out_cols) { + return; + } + T* output_i = output + out_y * out_cols + out_x; + uint32_t quant_value = *(reinterpret_cast(quant_data + element_offset / 2)); + const int remain_x = std::min(8, out_cols - out_x); + for (int i = 0; i < remain_x; i++) { + int32_t rid = reorder_idx ? 
reorder_idx[kb_idx * block_size + i] : kb_idx; + T scale = *(scale_data + n_idx * scales_shape_x + rid); + float zp_f = 8; + if (zero_points) { + if constexpr (std::is_same_v) { + zp_f = *(zero_points + n_idx * scales_shape_x + rid); + } else { + uint8_t zp = 8; + zp = zero_points[n_idx * zero_point_shape_x + rid / 2]; + zp = (rid & 0x01) ? (zp >> 4) : (zp & 0x0f); + } + } + + if constexpr (std::is_same_v) { + T zp_adjust = -scale * MLFloat16(zp_f); + output_i[i] = static_cast((quant_value >> (4 * i)) & 0xF) * scale + zp_adjust; + } else { + T zp_adjust = -scale * zp_f; + output_i[i] = T((quant_value >> (4 * i)) & 0xF) * scale + zp_adjust; + } + } +} + +template +void DequantizeBlockwise( + inputT* output, // dequantized output + const uint8_t* quant_data, // quantized input + const inputT* scales_data, // quantization scales + const zeroT* zero_points, // quantization zero points + const int32_t* reorder_idx, // reorder_idx for groupwise quantization + int32_t block_size, // quantization block size + bool, // columnwise quantization or row-wise + int32_t K, // number of rows in quantized input + int32_t N, // number of columns in quantized input + onnxruntime::concurrency::ThreadPool* pool) { + auto ceildiv = [](int a, int b) { return (a + b - 1) / b; }; + constexpr int element_per_thread = 8; + int groups_per_threadblock = 256 * element_per_thread / block_size; + int groups_per_K = ceildiv(K, block_size); + int total_groups = N * groups_per_K; // total elemenets in quant_data + int blocks_per_grid = static_cast(ceildiv(total_groups, groups_per_threadblock)); + concurrency::ThreadPool::TrySimpleParallelFor( + pool, static_cast(blocks_per_grid), + [&](std::ptrdiff_t block_id) { + for (int j = 0; j < 256; j++) { + Dequantize4BitsKernelReOrder(output, quant_data, scales_data, zero_points, + reorder_idx, block_size, groups_per_threadblock, + total_groups, N, K, static_cast(block_id), j); + } + }); +} + +template void DequantizeBlockwise( + float* output, const uint8_t* quant_data, const float* scales_data, + const uint8_t* zero_points, const int32_t* reorder_idx, int32_t block_size, + bool columnwise, int32_t K, int32_t N, onnxruntime::concurrency::ThreadPool* thread_pool); + +template void DequantizeBlockwise( + float* output, const uint8_t* quant_data, const float* scales_data, + const float* zero_points, const int32_t* reorder_idx, int32_t block_size, + bool columnwise, int32_t K, int32_t N, onnxruntime::concurrency::ThreadPool* thread_pool); + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.h b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.h new file mode 100644 index 0000000000000..5061ac5c800a6 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.h @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
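For reference, the math implemented by the naive reference path above can be sketched in a few lines of NumPy. The shapes, the column-wise layout, and uint8-packed zero points are assumptions taken from the MatMulNBits packing description; this is a sketch of the per-element arithmetic only, not the actual kernel:

```python
# Minimal NumPy sketch of blockwise 4-bit dequantization with an optional g_idx
# (column-wise layout and uint8-packed zero points assumed; not the actual kernel).
import numpy as np

def dequantize_blockwise_4bit(packed, scales, zero_points, g_idx, block_size, K, N):
    # packed:      [N, k_blocks, block_size // 2] uint8, two 4-bit weights per byte (low nibble first)
    # scales:      [N, k_blocks] float
    # zero_points: [N, (k_blocks + 1) // 2] uint8, two 4-bit zero points per byte, or None (default 8)
    # g_idx:       [K] int32 mapping row k to its quantization group, or None
    out = np.zeros((N, K), dtype=np.float32)
    for n in range(N):
        col_bytes = packed[n].reshape(-1)
        for k in range(K):
            group = int(g_idx[k]) if g_idx is not None else k // block_size
            byte = int(col_bytes[k // 2])
            q = (byte >> 4) if (k & 1) else (byte & 0x0F)
            if zero_points is not None:
                zp_byte = int(zero_points[n, group // 2])
                zp = (zp_byte >> 4) if (group & 1) else (zp_byte & 0x0F)
            else:
                zp = 8
            out[n, k] = (q - zp) * float(scales[n, group])
    return out  # row n is the dequantized column n of the original [K, N] weight
```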
+#include "core/providers/common.h" +#include "core/platform/threadpool.h" + +namespace onnxruntime { +namespace contrib { + +template +void DequantizeBlockwise( + inputT* output, // dequantized output + const uint8_t* quant_data, // quantized input + const inputT* scales_data, // quantization scales + const zeroT* zero_points, // quantization zero points + const int32_t* reorder_idx, // quantization zero points + int32_t block_size, // quantization block size + bool, // columnwise quantization or row-wise + int32_t K, // number of rows in quantized input + int32_t N, // number of columns in quantized input + onnxruntime::concurrency::ThreadPool* thread_pool); + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cu b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cu index 6b66f1d84e221..cd6593352008b 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cu +++ b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cu @@ -2,10 +2,12 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include #include #include #include #include +#include #include #include "core/providers/cuda/cu_inc/common.cuh" #include "core/providers/cuda/cuda_common.h" @@ -56,41 +58,94 @@ __device__ __forceinline__ void DequantizeEightElements(uint32_t values_quant, f } template -__global__ void Dequantize4BitsKernel( +__global__ void Dequantize4BitsKernelReOrder( T* output, const uint8_t* quant_data, const T* scale_data, const uint8_t* zero_points, + const int32_t* reorder_idx, int block_size, - int blocks_per_K, - int blocks_per_threadblock, - int total_blks, - int shift) { - int block_id = blockIdx.x * blocks_per_threadblock + ((threadIdx.x * 8) >> shift); - if (block_id >= total_blks) { + int groups_per_K, + int groups_per_threadblock, + int total_groups) { + int group_id = blockIdx.x * groups_per_threadblock + ((threadIdx.x * 8) / block_size); + if (group_id >= total_groups) { return; } - int n_idx = block_id / blocks_per_K; - int kb_idx = block_id % blocks_per_K; - int element_offset = block_id * block_size + ((threadIdx.x * 8) & ((1 << shift) - 1)); + // T __shared__ zero_points_after_reorder[];//K + // T __shared__ scales_after_reorder[]; // K + // const int num_r_per_thread = k / 256; + + const int zero_point_shape_x = (groups_per_K + 1) / 2; + const int scales_shape_x = groups_per_K; + int n_idx = group_id / scales_shape_x; + int kb_idx = group_id % scales_shape_x; + int element_offset = group_id * block_size + ((threadIdx.x * 8) & (block_size - 1)); + T* output_i = output + element_offset; + uint32_t quant_value = *(reinterpret_cast(quant_data + element_offset / 2)); + for (int i = 0; i < 8; i++) { + int32_t rid = reorder_idx[kb_idx * block_size + i]; + T scale = *(scale_data + n_idx * scales_shape_x + rid); + uint8_t zp = 8; + if (zero_points) { + zp = zero_points[n_idx * zero_point_shape_x + rid / 2]; + zp = (rid & 0x01) ? 
(zp >> 4) : (zp & 0x0f); + } + + if constexpr (std::is_same_v) { + T zp_adjust = -scale * __short2half_rn(zp); + output_i[i] = __uint2half_rn((quant_value >> (4 * i)) & 0xF) * scale + zp_adjust; + } else { + T zp_adjust = -scale * T(zp); + output_i[i] = T((quant_value >> (4 * i)) & 0xF) * scale + zp_adjust; + } + } +} + +template +__global__ void Dequantize4BitsKernel( + T* output, + const uint8_t* quant_data, + const T* scale_data, + const ZeroT* zero_points, + int block_size, + int groups_per_K, + int groups_per_threadblock, + int total_groups) { + int block_id = blockIdx.x * groups_per_threadblock + ((threadIdx.x * 8) / block_size); + if (block_id >= total_groups) { + return; + } + int element_offset = block_id * block_size + ((threadIdx.x * 8) & (block_size - 1)); uint32_t quant_value = *(reinterpret_cast(quant_data + element_offset / 2)); T scale = *(scale_data + block_id); - uint8_t zp = 8; - if (zero_points) { - zp = zero_points[n_idx * ((blocks_per_K + 1)/2) + kb_idx / 2]; - zp = (kb_idx & 0x01) ? (zp >> 4) : (zp & 0x0f); + T zero_point_value; + if constexpr (std::is_same_v) { + const int scales_shape_x = groups_per_K; + const int zero_point_shape_x = (groups_per_K + 1) / 2; + int kb_idx = block_id % scales_shape_x; + int n_idx = block_id / scales_shape_x; + uint8_t zp = 8; + if (zero_points) { + zp = zero_points[n_idx * zero_point_shape_x + kb_idx / 2]; + zp = (kb_idx & 0x01) ? (zp >> 4) : (zp & 0x0f); + } + zero_point_value = static_cast(zp); + } else { + zero_point_value = zero_points? *(zero_points + block_id):static_cast(8); } output = output + element_offset; - DequantizeEightElements(quant_value, scale, static_cast(zp), output); + DequantizeEightElements(quant_value, scale, zero_point_value, output); } -template +template Status Dequantize4Bits( T* output, const uint8_t* quant_data, const T* scales_data, - const uint8_t* zero_points, // shape: [N, (block_per_K + 1)/2] + const ZeroT* zero_points, // shape: [N, (block_per_K + 1)/2] + const int32_t* reorder_idx, int k, int n, int block_size, @@ -98,47 +153,79 @@ Status Dequantize4Bits( // k is padded and equal to block_per_K * block_size ORT_ENFORCE(k % block_size == 0, "k must be a multiplier of block_size"); constexpr int element_per_thread = 8; - int blocks_per_threadblock = GridDim::maxThreadsPerBlock * element_per_thread / block_size; - int blocks_per_K = k / block_size; - int total_blks = n * blocks_per_K; - int blocks_per_grid = static_cast(CeilDiv(n * blocks_per_K, blocks_per_threadblock)); - int shift = static_cast(log2f(float(block_size))); - - Dequantize4BitsKernel<<>>( - output, - quant_data, - scales_data, - zero_points, - block_size, - blocks_per_K, - blocks_per_threadblock, - total_blks, - shift); + int groups_per_threadblock = GridDim::maxThreadsPerBlock * element_per_thread / block_size; + int groups_per_K = k / block_size; + int total_groups = n * groups_per_K; // total elemenets in quant_data + int groups_per_grid = static_cast(CeilDiv(total_groups, groups_per_threadblock)); + if (!reorder_idx) { + Dequantize4BitsKernel<<>>( + output, + quant_data, + scales_data, + zero_points, + block_size, + groups_per_K, + groups_per_threadblock, + total_groups); + } else { + // static_assert(std::is_same_v, "ZeroT must be uint8_t"); + Dequantize4BitsKernelReOrder<<>>( + output, + quant_data, + scales_data, + (const uint8_t*)zero_points, + reorder_idx, + block_size, + groups_per_K, + groups_per_threadblock, + total_groups); + } return Status::OK(); } -template Status Dequantize4Bits( +template Status Dequantize4Bits( 
float* output, const uint8_t* quant_data, const float* scales_data, const uint8_t* zero_points, + const int32_t* reorder_idx, int k, int n, int block_size, cudaStream_t stream); -template Status Dequantize4Bits( +template Status Dequantize4Bits( half* output, const uint8_t* quant_data, const half* scales_data, const uint8_t* zero_points, + const int32_t* reorder_idx, + int k, + int n, + int block_size, + cudaStream_t stream); +template Status Dequantize4Bits( + float* output, + const uint8_t* quant_data, + const float* scales_data, + const float* zero_points, + const int32_t* reorder_idx, int k, int n, int block_size, cudaStream_t stream); - +template Status Dequantize4Bits( + half* output, + const uint8_t* quant_data, + const half* scales_data, + const half* zero_points, + const int32_t* reorder_idx, + int k, + int n, + int block_size, + cudaStream_t stream); /////////////////////////////////////////////////////////////////////////////// // A more general block-wise dequantization implementation that supports // different block sizes and block orientations (row-wise/column-wise). diff --git a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cuh b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cuh index f9c09c55fd893..580b5087f3fa3 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cuh +++ b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cuh @@ -7,18 +7,18 @@ namespace onnxruntime { namespace contrib { namespace cuda { -template +template Status Dequantize4Bits( T* output, const uint8_t* quant_data, const T* scales_data, - const uint8_t* zero_points, + const ZeroT* zero_points, + const int32_t* reorder_idx, int k, int n, int block_size, cudaStream_t stream); - /** * @brief Dequantize a block-wise quantized matrix, and store the result in a * column major matrix for use in subsequent GEMM. This implementation supports diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc index 015df70c8ec3c..1cec6f6a12f1c 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc @@ -1,15 +1,12 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
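As a rough sanity check on the group-based indexing used by the rewritten kernels above (groups_per_threadblock, element_offset), the mapping can be replayed on the host; a small sketch, assuming 256 threads per block and K divisible by block_size:

```python
# Host-side sketch of the thread-to-element mapping in the updated Dequantize4Bits
# launch (256 threads per block assumed; illustrative only).
def covered_elements(k, n, block_size, threads_per_block=256):
    element_per_thread = 8  # one packed uint32 holds 8 x 4-bit values
    groups_per_threadblock = threads_per_block * element_per_thread // block_size
    total_groups = n * (k // block_size)
    blocks_per_grid = (total_groups + groups_per_threadblock - 1) // groups_per_threadblock
    elems = []
    for block_idx in range(blocks_per_grid):
        for thread_idx in range(threads_per_block):
            group_id = block_idx * groups_per_threadblock + (thread_idx * 8) // block_size
            if group_id >= total_groups:
                continue
            element_offset = group_id * block_size + ((thread_idx * 8) & (block_size - 1))
            elems.extend(range(element_offset, element_offset + 8))
    return elems

# every element of the padded [N, K] weight is generated exactly once
assert sorted(covered_elements(k=1024, n=32, block_size=32)) == list(range(32 * 1024))
```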
-// -// This module define MatMulFp32Q4 operator, it is basically -// matmul float32 with right hand side being a 2-D matrix -// pre-packed and block-compacted into int4 -// - -#include "core/common/safeint.h" -#include "core/providers/cuda/cuda_kernel.h" -#include "core/providers/cuda/shared_inc/fpgeneric.h" +#include "contrib_ops/cuda/quantization/matmul_nbits.h" + +#include + +#include "core/common/status.h" +#include "core/framework/float16.h" #include "core/providers/cpu/math/matmul_helper.h" #include "matmul_nbits.cuh" #include "dequantize_blockwise.cuh" @@ -19,40 +16,19 @@ namespace contrib { namespace cuda { using namespace onnxruntime::cuda; -template -class MatMulNBits final : public CudaKernel { - public: - MatMulNBits(const OpKernelInfo& info) : CudaKernel(info) { - ORT_ENFORCE(Status::OK() == info.GetAttr("K", &K_)); - ORT_ENFORCE(Status::OK() == info.GetAttr("N", &N_)); - ORT_ENFORCE(Status::OK() == info.GetAttr("block_size", &block_size_)); - ORT_ENFORCE(Status::OK() == info.GetAttr("bits", &nbits_)); - ORT_ENFORCE(nbits_ == 4, - "Only 4b quantization is supported for MatMulNBits op," - " additional bits support is planned."); - } - - Status ComputeInternal(OpKernelContext* context) const override; - - private: - int64_t K_; - int64_t N_; - int64_t block_size_; - int64_t nbits_; - bool column_wise_quant_blk_{true}; -}; - template Status MatMulNBits::ComputeInternal(OpKernelContext* ctx) const { const Tensor* a = ctx->Input(0); const Tensor* b = ctx->Input(1); const Tensor* scales = ctx->Input(2); const Tensor* zero_points = ctx->Input(3); + const Tensor* reorder_idx = ctx->Input(4); const auto* a_data = a->Data(); const uint8_t* blob_data = b->Data(); const auto* scales_data = scales->Data(); - const auto* zero_points_data = zero_points == nullptr ? nullptr : zero_points->Data(); + const auto* zero_points_data = zero_points == nullptr ? nullptr : zero_points->DataRaw(); + const auto* reorder_idx_data = reorder_idx == nullptr ? 
nullptr : reorder_idx->Data(); typedef typename ToCudaType::MappedType CudaT; @@ -67,77 +43,99 @@ Status MatMulNBits::ComputeInternal(OpKernelContext* ctx) const { // Bail out early if the output is going to be empty if (Y->Shape().Size() == 0) return Status::OK(); - bool is_4bit_done = TryMatMul4Bits( - reinterpret_cast(Y->MutableData()), - reinterpret_cast(a_data), - blob_data, - reinterpret_cast(scales_data), - zero_points_data, - SafeInt(helper.M()), - SafeInt(helper.N()), - SafeInt(helper.K()), - SafeInt(block_size_), - SafeInt(GetDeviceProp().sharedMemPerBlock), - static_cast(ctx->GetComputeStream()->GetHandle())); - if (!is_4bit_done) { - int64_t K_padded = (K_ + block_size_ - 1) / block_size_ * block_size_; - IAllocatorUniquePtr b_data_ptr = GetScratchBuffer(N_ * K_padded, ctx->GetComputeStream()); - auto* b_data = b_data_ptr.get(); - if (column_wise_quant_blk_) { - // column-wise block + bool is_4bit_done = (reorder_idx_data == nullptr) && + (!zero_points || !zero_points->IsDataType()) && + TryMatMul4Bits( + reinterpret_cast(Y->MutableData()), + reinterpret_cast(a_data), + blob_data, + reinterpret_cast(scales_data), + static_cast(zero_points_data), + SafeInt(helper.M()), + SafeInt(helper.N()), + SafeInt(helper.K()), + SafeInt(block_size_), + SafeInt(GetDeviceProp().sharedMemPerBlock), + static_cast(ctx->GetComputeStream()->GetHandle())); + + if (is_4bit_done) { + return Status::OK(); + } + + int64_t K_padded = (K_ + block_size_ - 1) / block_size_ * block_size_; + IAllocatorUniquePtr b_data_ptr = GetScratchBuffer(N_ * K_padded, ctx->GetComputeStream()); + auto* b_data = b_data_ptr.get(); + if (column_wise_quant_blk_) { + if (reorder_idx) { + ORT_ENFORCE(K_padded == reorder_idx->Shape()[0], "K_padded != g_idx->Shape()[0]"); + } + // column-wise block + if ((zero_points && zero_points->IsDataType())) { ORT_RETURN_IF_ERROR(Dequantize4Bits( reinterpret_cast(b_data), blob_data, reinterpret_cast(scales_data), - zero_points_data, + (const CudaT*)zero_points_data, + reorder_idx_data, SafeInt(K_padded), SafeInt(N_), SafeInt(block_size_), static_cast(ctx->GetComputeStream()->GetHandle()))); } else { - // row-wise block - K_padded = K_; - - ORT_RETURN_IF_ERROR(DequantizeBlockwise4b( + ORT_RETURN_IF_ERROR(Dequantize4Bits( reinterpret_cast(b_data), blob_data, reinterpret_cast(scales_data), - zero_points_data, - SafeInt(block_size_), - column_wise_quant_blk_, - SafeInt(K_), + (const uint8_t*)zero_points_data, + reorder_idx_data, + SafeInt(K_padded), SafeInt(N_), + SafeInt(block_size_), static_cast(ctx->GetComputeStream()->GetHandle()))); } + } else { + // row-wise block + K_padded = K_; + + ORT_RETURN_IF_ERROR(DequantizeBlockwise4b( + reinterpret_cast(b_data), + blob_data, + reinterpret_cast(scales_data), + (const uint8_t*)zero_points_data, + SafeInt(block_size_), + column_wise_quant_blk_, + SafeInt(K_), + SafeInt(N_), + static_cast(ctx->GetComputeStream()->GetHandle()))); + } #if 0 - cudaStreamSynchronize(static_cast(ctx->GetComputeStream()->GetHandle())); - T* b_data_cpu = new T[K_ * N_]; - cudaMemcpy(b_data_cpu, b_data, K_ * N_ * sizeof(T), cudaMemcpyDeviceToHost); - delete[] b_data_cpu; +cudaStreamSynchronize(static_cast(ctx->GetComputeStream()->GetHandle())); +T* b_data_cpu = new T[K_ * N_]; +cudaMemcpy(b_data_cpu, b_data, K_ * N_ * sizeof(T), cudaMemcpyDeviceToHost); +delete[] b_data_cpu; #endif - const CudaT alpha = ToCudaType::FromFloat(1.f); - const CudaT zero = ToCudaType::FromFloat(0.f); - - if (helper.OutputOffsets().size() == 1) { - CUBLAS_RETURN_IF_ERROR(cublasGemmHelper( - 
GetCublasHandle(ctx), - CUBLAS_OP_T, - CUBLAS_OP_N, - SafeInt(helper.N()), - SafeInt(helper.M()), - SafeInt(helper.K()), - &alpha, - reinterpret_cast(b_data), - SafeInt(K_padded), - reinterpret_cast(a_data), - helper.Lda(transa), - &zero, - reinterpret_cast(Y->MutableData()), - helper.Ldc(), - GetDeviceProp(), - UseTF32())); - } + const CudaT alpha = ToCudaType::FromFloat(1.f); + const CudaT zero = ToCudaType::FromFloat(0.f); + + if (helper.OutputOffsets().size() == 1) { + CUBLAS_RETURN_IF_ERROR(cublasGemmHelper( + GetCublasHandle(ctx), + CUBLAS_OP_T, + CUBLAS_OP_N, + SafeInt(helper.N()), + SafeInt(helper.M()), + SafeInt(helper.K()), + &alpha, + reinterpret_cast(b_data), + SafeInt(K_padded), + reinterpret_cast(a_data), + helper.Lda(transa), + &zero, + reinterpret_cast(Y->MutableData()), + helper.Ldc(), + GetDeviceProp(), + UseTF32())); } return Status::OK(); diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.h b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.h new file mode 100644 index 0000000000000..f5c2c6c4e4fdf --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.h @@ -0,0 +1,41 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// +// This module define MatMulNBits operator, it is basically +// matmul float with right hand side being a 2-D matrix +// pre-packed and block-compacted into int4 +// +#pragma once +#include "core/common/safeint.h" +#include "core/providers/cuda/cuda_kernel.h" +#include "core/providers/cuda/shared_inc/fpgeneric.h" + +namespace onnxruntime { +namespace contrib { +namespace cuda { +using namespace onnxruntime::cuda; + +template +class MatMulNBits final : public CudaKernel { + public: + MatMulNBits(const OpKernelInfo& info) : CudaKernel(info) { + ORT_ENFORCE(Status::OK() == info.GetAttr("K", &K_)); + ORT_ENFORCE(Status::OK() == info.GetAttr("N", &N_)); + ORT_ENFORCE(Status::OK() == info.GetAttr("block_size", &block_size_)); + ORT_ENFORCE(Status::OK() == info.GetAttr("bits", &nbits_)); + } + + Status ComputeInternal(OpKernelContext* context) const override; + + private: + int64_t K_; + int64_t N_; + int64_t block_size_; + int64_t nbits_; + bool column_wise_quant_blk_{true}; +}; + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index e33ce20737f80..f06a3785f362d 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -3343,22 +3343,23 @@ MatMulNBits is a MatMul with weight quantized with N bits(e.g., 2, 3, 4, 5, 6, 7 And block_size is not an arbitrary number and must be a power of 2 and not smaller than 16, like 16, 32, 64, 128,.. 3. Input B's scale and zero point are specified by input scales and zero_points. -Input B is stored as uint8_t with shape: [N][n_blocks_per_col][blob_size] in which: -- n_blocks_per_col = (K + block_size - 1) / block_size -- blob_size = block_size / 8 * bits - - For a block blob. 
It is stored in format: - struct Blob { - uint8 one_bits[(bits & 0x1) * 1 * block_size / 8]; // highest 1 bit for 3, 5, 7 bits quantization - uint8 two_bits[(bits & 0x2) * 2 * block_size / 8]; // high 2 bits for 2, 6, 7 bits quantization - uint8 four_bits[(bits & 0x4) * 4 * block_size / 8]; // low 4 bits for 4, 5, 6 bits quantization - } + Input is stored as uint8_t with shape: [N][n_blocks_per_col][blob_size] in which: + - n_blocks_per_col = (K + block_size - 1) / block_size + - blob_size = CeilDiv(block_size * bits, bitsof(uint8_t)<8>) + For all bits from 2-8, a row of data is stored squeezely and represented by uint8_t. + - for 2,4,8 bits, 4x2bit,2x4bit,1x8bit are stored in one uint8_t. + 4bit example: + |.|.|.|.| .|.|.|.| =uint8_t (2x4bit) + - for 3,5,6,7 bits, 32x3bit,32x5bit,16x6bit,32x7bit are stored in 12xuint8_t,20xuint8_t,12xuint8_t,28xuint8_t separately. no bits are wasted. + 3bit example: + |.|.|. |.|.|. |.|.|. = 9bit, which across 2 uint8_t, the highest bit for the second uint8_t is used. + The last uint_8 may have some bits unused. -Input scales is stored in same type as original type of B(float32, float16) with shape like: [N * n_blocks_per_col] -Input zero_points is stored as uint8_t. If bits <= 4, two zero points are stored as one unit8_t. If bits > 4, one zero point is stored with one unit8_t. Thus, its shape is: - - [(N * n_blocks_per_col + 1) / 2] if bits <=4 - - [N * n_blocks_per_col] if bits > 4 +Input scales is stored in same type as original type of B(float32, float16) with shape like: [N * n_blocks_per_col] +Input zero_points is stored as uint8_t or same as type(A). It has the same packing method as input B. + - [CeilDiv((N * n_blocks_per_col + 1) *bits, 8)] + If zero_points has same type as A, it's not packed and has the same shape as Scales. )DOC"; ONNX_CONTRIB_OPERATOR_SCHEMA(MatMulNBits) @@ -3377,12 +3378,15 @@ Input zero_points is stored as uint8_t. If bits <= 4, two zero points are stored "type T1.", AttributeProto::INT, static_cast(0)) .Input(0, "A", "The input tensor, not quantized", "T1") - .Input(1, "B", "1-dimensional data blob", "T2") + .Input(1, "B", "1 or 2 dimensional data blob", "T2") .Input(2, "scales", "quantization scale", "T1") - .Input(3, "zero_points", "quantization zero points", "T2", OpSchema::Optional) + .Input(3, "zero_points", "quantization zero points", "T3", OpSchema::Optional) + .Input(4, "g_idx", "group_idx", "T4", OpSchema::Optional) .Output(0, "Y", "tensor. The output tensor has the same rank as the input. 
", "T1") .TypeConstraint("T1", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float/half_float tensors.") - .TypeConstraint("T2", {"tensor(uint8)"}, "Constrain quantized weight types to uint8.") + .TypeConstraint("T2", {"tensor(uint8)", "tensor(int32)"}, "Constrain quantized weight types to uint8/int32.") + .TypeConstraint("T3", {"tensor(uint8)", "tensor(int32)", "tensor(float16)", "tensor(float)"}, "Constrain quantized zero point types to uint8/int32/float16/float.") + .TypeConstraint("T4", {"tensor(int32)"}, "the index tensor.") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { // Type inference propagateElemTypeFromInputToOutput(ctx, 0, 0); diff --git a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py index eb7bbec997d59..a1916e806c5c0 100644 --- a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py @@ -65,7 +65,7 @@ def __init__( self, calibration_data_reader: CalibrationDataReader, percdamp=0.01, - blocksize=128, + block_size=128, actorder=False, mse=False, perchannel=True, @@ -79,7 +79,7 @@ def __init__( a calibration data reader. It enumerates calibration data and generates inputs for the original model. percdamp: percent of the average Hessian diagonal to use for dampening. - blocksize (int, optional): + block_size (int, optional): channel number in one block to execute a GPTQ quantization iteration. actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value. @@ -93,42 +93,285 @@ def __init__( ) self.calibration_data_reader = calibration_data_reader self.percdamp = percdamp - self.blocksize = blocksize + self.block_size = block_size self.actorder = actorder self.mse = mse self.perchannel = perchannel -class MatMul4BitsQuantizer: - """Perform 4b quantization of constant MatMul weights""" +class HQQWeightOnlyQuantConfig(WeightOnlyQuantConfig): + def __init__( + self, + block_size=128, + bits=4, + axis=1, + ): + """ + This is a class for HQQ algorithm Weight Only Quant Configuration. + HQQ algorithm quant weight without needing calibrate data. + + Args: + block_size (int, optional): + channel number in one block to execute a GPTQ quantization iteration. + bits (int, optional): + how many bits to represent weight. + axis (int, optional): + 0 or 1. which axis to quantize. 
https://arxiv.org/pdf/2309.15531.pdf + """ + super().__init__( + algorithm="HQQ", + ) + self.block_size = block_size + self.bits = bits + self.axis = axis + +class DefaultWeightOnlyQuantConfig(WeightOnlyQuantConfig): def __init__( self, - model: ModelProto | str, - block_size: int, - is_symmetric: bool, + block_size: int = 128, + is_symmetric: bool = False, accuracy_level: int | None = None, - nodes_to_exclude=None, - algo_config: WeightOnlyQuantConfig = None, ): - if nodes_to_exclude is None: - nodes_to_exclude = [] - self.model = ONNXModel(onnx.load(model)) if isinstance(model, str) else ONNXModel(model) - self.model_path = model if isinstance(model, str) else None + super().__init__(algorithm="DEFAULT") self.block_size = block_size self.is_symmetric = is_symmetric + self.bits = 4 self.accuracy_level = accuracy_level - self.nodes_to_exclude = set(nodes_to_exclude) - self.algo_config = algo_config + + +def is_divisible(val1, val2): + return int(val2 * np.ceil(val1 / val2)) == val1 + + +class HQQWeightOnlyQuantizer: + def __init__( + self, + config: HQQWeightOnlyQuantConfig, + ): + self.config = config + + # Proximal solver || weight - dequantize(quantize(weight))||_p^p + @staticmethod + def optimize_weights( + tensor, + scale, + zero, + min_max: list[int], + axis: int = 0, + opt_params: dict = None, # noqa: RUF013 + verbose=False, + ): + import torch + + opt_params = {"lp_norm": 0.7, "beta": 1e1, "kappa": 1.01, "iters": 20} if opt_params is None else opt_params + lp_norm, beta, kappa, iters = ( + opt_params["lp_norm"], + opt_params["beta"], + opt_params["kappa"], + opt_params["iters"], + ) + + dtype = torch.float16 if tensor.is_cuda else torch.float32 + w_f = tensor.to(dtype) + scale = scale.to(dtype) + zero = zero.to(dtype) + + if lp_norm == 1: + + def shrink_op(x, beta): + return torch.sign(x) * torch.nn.functional.relu(torch.abs(x) - 1.0 / beta) + + else: + + def shrink_op(x, beta, p=lp_norm): + return torch.sign(x) * torch.nn.functional.relu( + torch.abs(x) - (1.0 / beta) * torch.pow(torch.abs(x) + 1e-8, p - 1) + ) + + best_error = 1e4 + for i in range(iters): + w_q = torch.round(w_f * scale + zero).clamp(min_max[0], min_max[1]) + w_r = (w_q - zero) / scale + w_e = shrink_op(w_f - w_r, beta) + zero = torch.mean(w_q - (w_f - w_e) * scale, axis=axis, keepdim=True) + beta *= kappa + + current_error = float(torch.abs(w_f - w_r).mean()) + if verbose: + print(i, np.round(current_error, 6)) + if current_error < best_error: + best_error = current_error + else: + break + + del w_f, w_q, w_r, w_e + + return scale, zero @staticmethod - def __get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]: - for gid in range(len(graph_path) - 1, -1, -1): - graph = graph_path[gid] - for tensor in graph.initializer: - if tensor.name == name: - return tensor, graph - return None, None + def pack_on_row_fast_248bit(pack_tensor, ori_int_tensor, bits): + if pack_tensor.shape[0] == ori_int_tensor.shape[0]: + ori_int_tensor = ori_int_tensor.T + pack_tensor = pack_tensor.T + if bits in [2, 4, 8]: + compress_ratio = pack_tensor.element_size() * 8 // bits + for j in range(0, compress_ratio): + pack_tensor[0:] |= ori_int_tensor[j::compress_ratio] << (bits * (j)) + else: + raise NotImplementedError("Only 2,4,8 bits are supported.") + + # from Official implementation of Half-Quadratic Quantization (HQQ) + def quantize_internal( + self, tensor, bits=4, channel_wise=True, group_size=64, optimize=True, round_zero=True, axis=1 + ): + import torch + + weight = tensor.float() + ori_shape = 
weight.shape + + pad_len = (group_size - ori_shape[axis] % group_size) % group_size + if axis == 1: + weight = torch.nn.functional.pad(weight, (0, pad_len), "constant", 0) + else: + weight = torch.nn.functional.pad(weight, (0, 0, 0, pad_len), "constant", 0) + shape = weight.shape + + # Reshape for grouping + if (group_size is not None) and channel_wise: + weight = weight.reshape([-1, group_size]) if (axis == 1) else weight.reshape([group_size, -1]) + + # Get min/max values + if channel_wise is False: + _min, _max = weight.min(), weight.max() + optimize = False + else: + _min = weight.min(axis=axis, keepdim=True)[0] + _max = weight.max(axis=axis, keepdim=True)[0] + + max_v = 2**bits - 1 + min_v = 0 + min_max = [min_v, max_v] + + # Note: here we work with the inverse of the scale to avoid division and quantize instead via weight*scale + zero, the scale is inverted later on. + # clamp to avoid half-precision problems + scale = (max_v / (_max - _min)).clamp(max=2e4) + #!!!!!!!!!!!!!!! + min_max_axis = _max - _min + if (min_max_axis == 0).sum().item() > 0: + min_max_axis[min_max_axis == 0] = max_v + scale = (max_v / min_max_axis).clamp(max=2e4) + zero = -_min * scale + + if round_zero: + zero = torch.round(zero) + + # Fine-tune weights + if optimize: + scale, zero = self.optimize_weights(tensor=weight, scale=scale, zero=zero, min_max=min_max, axis=axis) + + # Quantize + # Necessary for fake quantization backprop + w_q = torch.round(weight * scale + zero).clamp(min_max[0], min_max[1]) + w_q = w_q.reshape(shape).int() + + scale = 1.0 / scale + if axis == 1: + scale = scale.reshape(shape[0], -1) + zero = zero.reshape(shape[0], -1) + else: + scale = scale.reshape(-1, shape[-1]) + zero = zero.reshape(-1, shape[-1]) + # cleanup + del weight, _min, _max + + return w_q, scale.to(tensor.dtype), zero.to(tensor.dtype) + + def quantize(self, node: NodeProto, graph_stack: list[GraphProto]): + """If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node""" + if node.op_type != "MatMul": + return node # only care about MatMul for now + import torch + + logger.info(f"start to quantize {node.name} ...") + inputB = node.input[1] # noqa: N806 + b_pb, bs_graph = get_initializer(inputB, graph_stack) + if b_pb is None: + logger.info("MatMul doesn't have const weight. Skip to quantize") + return node # only care about constant weight + + b_array = onnx.numpy_helper.to_array(b_pb) + if len(b_array.shape) != 2: + logger.info("MatMul weight is not 2D. 
Skip to quantize") + return node # can only process 2-D matrix + b_array_torch = torch.from_numpy(b_array) + if torch.cuda.is_available(): + b_array_torch = b_array_torch.cuda() + quant_weight_torch, scales_torch, zero_points_torch = self.quantize_internal( + b_array_torch.T, bits=self.config.bits, group_size=self.config.block_size + ) + quant_weight_torch = quant_weight_torch.contiguous() + scales_torch = scales_torch.contiguous() + zero_points_torch = zero_points_torch.contiguous() + + packed_torch = torch.zeros( + (quant_weight_torch.shape[0], quant_weight_torch.shape[1] // 2), + dtype=torch.uint8, + device=quant_weight_torch.device, + ) + self.pack_on_row_fast_248bit(packed_torch, quant_weight_torch, self.config.bits) + scales = scales_torch.cpu().numpy() + zero_points = zero_points_torch.cpu().numpy() + b_quant = onnx.numpy_helper.from_array(packed_torch.cpu().numpy()) + b_quant.name = b_pb.name + "_Q4" + for input in bs_graph.input: + if input.name == inputB: + bs_graph.input.remove(input) + break + + scales_tensor = onnx.numpy_helper.from_array(scales) + scales_tensor.name = b_pb.name + "_scales" + bs_graph.initializer.extend([b_quant, scales_tensor]) + + input_names = [node.input[0], b_quant.name, scales_tensor.name] + zp_tensor = onnx.numpy_helper.from_array(zero_points) + zp_tensor.name = b_pb.name + "_zero_points" + bs_graph.initializer.extend([zp_tensor]) + input_names.append(zp_tensor.name) + + kwargs = {} + rows, cols = b_array.shape + kwargs["K"] = rows + kwargs["N"] = cols + kwargs["bits"] = self.config.bits + kwargs["block_size"] = self.config.block_size + + matmul_q4_node = onnx.helper.make_node( + "MatMulNBits", + inputs=input_names, + outputs=[node.output[0]], + name=node.name + "_Q4" if node.name else "", + domain="com.microsoft", + **kwargs, + ) + + logger.info(f"complete quantization of {node.name} ...") + + return matmul_q4_node + + +def get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]: + for gid in range(len(graph_path) - 1, -1, -1): + graph = graph_path[gid] + for tensor in graph.initializer: + if tensor.name == name: + return tensor, graph + return None, None + + +class DefaultWeightOnlyQuantizer: + def __init__(self, config: DefaultWeightOnlyQuantConfig): + self.config = config def int4_block_quant(self, fp32weight: npt.ArrayLike) -> np.ndarray: """4b quantize fp32 weight to a blob""" @@ -137,7 +380,7 @@ def int4_block_quant(self, fp32weight: npt.ArrayLike) -> np.ndarray: raise ValueError("Current int4 block quantization only supports 2D tensors!") rows, cols = fp32weight.shape - block_size = self.block_size + block_size = self.config.block_size blob_size = block_size // 2 k_blocks = (rows + block_size - 1) // block_size padded_rows = k_blocks * block_size @@ -149,23 +392,19 @@ def int4_block_quant(self, fp32weight: npt.ArrayLike) -> np.ndarray: packed = np.zeros((cols, k_blocks, blob_size), dtype="uint8") scales = np.zeros((cols * k_blocks), dtype=fp32weight.dtype) zero_point = np.zeros(cols * ((k_blocks + 1) // 2), dtype="uint8") - quantize_matmul_4bits(packed, fp32weight, scales, zero_point, block_size, cols, rows, self.is_symmetric) + quantize_matmul_4bits(packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric) return (packed, scales, zero_point) - def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: list[GraphProto]) -> NodeProto: + def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> NodeProto: """If the node is MatMul with fp32 const weight, quantize the 
weight with int4, and return the new node""" if node.op_type != "MatMul": return node # only care about MatMul for now logger.info(f"start to quantize {node.name} ...") - if node.name in self.nodes_to_exclude: - logger.info(f"exclude to quantize {node.name} as specified by nodes_to_exclude...") - return node - inputB = node.input[1] # noqa: N806 - B, Bs_graph = MatMul4BitsQuantizer.__get_initializer(inputB, graph_stack) # noqa: N806 + B, Bs_graph = get_initializer(inputB, graph_stack) # noqa: N806 if B is None: logger.info("MatMul doesn't have const weight. Skip to quantize") return node # only care about constant weight @@ -188,7 +427,7 @@ def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: list[GraphProto]) Bs_graph.initializer.extend([B_quant, scales_tensor]) input_names = [node.input[0], B_quant.name, scales_tensor.name] - if not self.is_symmetric: + if not self.config.is_symmetric: zp_tensor = onnx.numpy_helper.from_array(zero_points) zp_tensor.name = B.name + "_zero_points" Bs_graph.initializer.extend([zp_tensor]) @@ -199,8 +438,8 @@ def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: list[GraphProto]) kwargs["K"] = rows kwargs["N"] = cols kwargs["bits"] = 4 - kwargs["block_size"] = self.block_size - if self.accuracy_level is not None: + kwargs["block_size"] = self.config.block_size + if self.config.accuracy_level is not None: kwargs["accuracy_level"] = self.accuracy_level matmul_q4_node = onnx.helper.make_node( @@ -216,6 +455,38 @@ def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: list[GraphProto]) return matmul_q4_node + +class MatMul4BitsQuantizer: + """Perform 4b quantization of constant MatMul weights""" + + def __init__( + self, + model: ModelProto | str, + block_size: int = 128, + is_symmetric: bool = False, + accuracy_level: int | None = None, + nodes_to_exclude=None, + algo_config: WeightOnlyQuantConfig = None, + ): + if nodes_to_exclude is None: + nodes_to_exclude = [] + self.model = ONNXModel(onnx.load(model)) if isinstance(model, str) else ONNXModel(model) + self.model_path = model if isinstance(model, str) else None + self.block_size = block_size + self.is_symmetric = is_symmetric + self.accuracy_level = accuracy_level + self.nodes_to_exclude = set(nodes_to_exclude) + self.node_quantizer = None + if algo_config is None: + algo_config = DefaultWeightOnlyQuantConfig( + block_size=block_size, is_symmetric=is_symmetric, accuracy_level=accuracy_level + ) + self.algo_config = algo_config + if algo_config.algorithm == "HQQ": + self.node_quantizer = HQQWeightOnlyQuantizer(self.algo_config) + elif algo_config.algorithm == "DEFAULT": + self.node_quantizer = DefaultWeightOnlyQuantizer(self.algo_config) + def _process_subgraph(self, graph_stack: list[GraphProto]): new_nodes = [] graph = graph_stack[-1] @@ -246,8 +517,15 @@ def _process_subgraph(self, graph_stack: list[GraphProto]): node = onnx.helper.make_node( # noqa: PLW2901 node.op_type, node.input, node.output, name=node.name, **kwargs ) - - new_nodes.append(self._q4_matmul_node_weight(node, graph_stack)) + out_node = None + if node.name in self.nodes_to_exclude: + logger.info(f"exclude to quantize {node.name} as specified by nodes_to_exclude...") + out_node = node + elif self.algo_config is not None and self.algo_config.algorithm == "HQQ": + out_node = self.node_quantizer.quantize(node, graph_stack) + else: + out_node = self.node_quantizer.quantize(node, graph_stack) + new_nodes.append(out_node) graph.ClearField("node") graph.node.extend(new_nodes) @@ -300,7 +578,7 @@ def inc_dataloader(): 
from neural_compressor.adaptor.ox_utils.weight_only import gptq_quantize kwargs["percdamp"] = self.algo_config.percdamp - kwargs["blocksize"] = self.algo_config.blocksize + kwargs["blocksize"] = self.algo_config.block_size kwargs["actorder"] = self.algo_config.actorder kwargs["mse"] = self.algo_config.mse kwargs["perchannel"] = self.algo_config.perchannel @@ -316,7 +594,7 @@ def inc_dataloader(): logger.info(f"complete quantization of model with {algorithm} algorithm.") def process(self): - if self.algo_config is None: + if self.algo_config.algorithm in ["HQQ", "DEFAULT"]: # use a stack to keep track of sub-graphs graph_stack = [self.model.graph()] opset_import = self.model.opset_import() @@ -327,7 +605,6 @@ def process(self): has_ms_domain = True if not has_ms_domain: opset_import.extend([onnx.helper.make_opsetid("com.microsoft", 1)]) - self._process_subgraph(graph_stack) self.model.clean_initializers() else: @@ -366,6 +643,14 @@ def parse_args(): parser.add_argument("--input_model", required=True, help="Path to the input model file") parser.add_argument("--output_model", required=True, help="Path to the output model file") parser.add_argument("--block_size", required=False, default=32, type=int, help="Block size for quantization") + parser.add_argument( + "--quant_method", + default="default", + type=str, + choices=["default", "hqq"], + help="the algorithm used to quantize weight", + ) + parser.add_argument("--bits", default=4, type=int, help="the target bits to represent weight") parser.add_argument( "--symmetric", required=False, @@ -411,12 +696,24 @@ def parse_args(): raise Exception(f"file {output_model_path} already exists") model = onnx.load(input_model_path) + if args.quant_method == "hqq": + quant_config = HQQWeightOnlyQuantConfig(block_size=args.block_size, bits=args.bits) + elif args.quant_method == "default": + quant_config = DefaultWeightOnlyQuantConfig( + block_size=args.block_size, is_symmetric=args.symmetric, accuracy_level=args.accuracy_level + ) + elif args.quant_method == "rtn": + quant_config = RTNWeightOnlyQuantConfig() + elif args.quant_method == "gptq": + quant_config = GPTQWeightOnlyQuantConfig(block_size=args.block_size) + else: + raise ValueError(f"Unsupported quantization method: {args.quant_method}") + quant = MatMul4BitsQuantizer( model=model, - block_size=args.block_size, - is_symmetric=args.symmetric, accuracy_level=args.accuracy_level, nodes_to_exclude=args.nodes_to_exclude, + algo_config=quant_config, ) quant.process() quant.model.save_model_to_file(output_model_path, True) diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index 2ad20eafc2ef1..d294fd4e2b0e0 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -2,6 +2,7 @@ // Licensed under the MIT License. 
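For reference, the Python-API equivalent of the new `--quant_method` switch is roughly the following (a minimal sketch; file names and block size are placeholders):

```python
# Minimal sketch of driving the quantizer from Python instead of the CLI
# (model paths and block size are placeholders).
import onnx
from onnxruntime.quantization import matmul_4bits_quantizer

model = onnx.load("model_fp32.onnx")  # placeholder path

# data-free default 4-bit config, mirroring --quant_method default
default_config = matmul_4bits_quantizer.DefaultWeightOnlyQuantConfig(block_size=32, is_symmetric=True)

# HQQ config, mirroring --quant_method hqq (requires torch at runtime)
hqq_config = matmul_4bits_quantizer.HQQWeightOnlyQuantConfig(block_size=32, bits=4)

# either config object can be passed as algo_config
quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(model, algo_config=hqq_config)
quant.process()
quant.model.save_model_to_file("model_int4.onnx", True)  # placeholder output path
```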
#ifndef ORT_MINIMAL_BUILD +#include #include "core/common/span_utils.h" #include "core/framework/tensor.h" @@ -66,7 +67,9 @@ void QuantizeDequantize(std::vector& raw_vals, } void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accuracy_level, - bool has_zeropoint, bool use_float16, float fp16_abs_error = 0.02f) { + bool has_zeropoint, bool use_float16, bool has_g_idx = false, + bool zp_is_4bit = true, float fp16_abs_error = 0.02f) { + zp_is_4bit = zp_is_4bit | has_g_idx; RandomValueGenerator random{1234}; std::vector input0_vals(random.Gaussian(std::vector({M, K}), 0.0f, 0.25f)); std::vector input1_f_vals(random.Gaussian(std::vector({K, N}), 0.0f, 0.25f)); @@ -113,12 +116,40 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura test.AddAttribute("block_size", block_size); test.AddAttribute("bits", QBits); test.AddAttribute("accuracy_level", accuracy_level); + auto ceildiv = [](int64_t a, int64_t b) { return (a + b - 1) / b; }; + if (use_float16) { test.AddInput("A", {M, K}, ToFloat16(input0_vals), false); test.AddInput("B", {q_cols, q_rows}, input1_vals, true); test.AddInput("scales", {static_cast(q_scale_size)}, ToFloat16(scales), true); if (has_zeropoint) { - test.AddInput("zero_points", {static_cast(q_zp_size_in_bytes)}, zp, true); + if (zp_is_4bit) { + test.AddInput("zero_points", {static_cast(q_zp_size_in_bytes)}, zp, true); + } else { + std::vector zp_f; + zp_f.reserve(q_zp_size_in_bytes * 2); + for (size_t i = 0; i < zp.size(); i++) { + zp_f.push_back(static_cast(zp[i] & 0xf)); + zp_f.push_back(static_cast((zp[i] >> 4) & 0xf)); + } + size_t ind = zp_f.size() - 1; + while (zp_f.size() != q_scale_size) { + zp_f.erase(zp_f.begin() + ind); + ind -= q_scale_size / N + 1; + } + + test.AddInput("zero_points", {static_cast(q_scale_size)}, ToFloat16(zp_f), true); + } + } else { + test.AddInput("", {0}, {}); + } + if (has_g_idx) { + int K_pad = gsl::narrow(ceildiv(K, block_size) * block_size); + std::vector g_idx(K_pad); + for (int64_t i = 0; i < K_pad; i++) { + g_idx[i] = gsl::narrow(i / block_size); + } + test.AddInput("g_idx", {static_cast(K_pad)}, g_idx, true); } test.AddOutput("Y", {M, N}, ToFloat16(expected_vals)); @@ -132,9 +163,34 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura test.AddInput("B", {q_cols, q_rows}, input1_vals, true); test.AddInput("scales", {static_cast(q_scale_size)}, scales, true); if (has_zeropoint) { - test.AddInput("zero_points", {static_cast(q_zp_size_in_bytes)}, zp, true); - } + if (zp_is_4bit) { + test.AddInput("zero_points", {static_cast(q_zp_size_in_bytes)}, zp, true); + } else { + std::vector zp_f; + zp_f.reserve(q_zp_size_in_bytes * 2); + for (size_t i = 0; i < zp.size(); i++) { + zp_f.push_back(static_cast(zp[i] & 0xf)); + zp_f.push_back(static_cast((zp[i] >> 4) & 0xf)); + } + size_t ind = zp_f.size() - 1; + while (zp_f.size() != q_scale_size) { + zp_f.erase(zp_f.begin() + ind); + ind -= q_scale_size / N + 1; + } + test.AddInput("zero_points", {static_cast(q_scale_size)}, zp_f, true); + } + } else { + test.AddInput("", {0}, {}); + } + if (has_g_idx) { + int K_pad = gsl::narrow(ceildiv(K, block_size) * block_size); + std::vector g_idx(K_pad); + for (int64_t i = 0; i < K_pad; i++) { + g_idx[i] = gsl::narrow(i / block_size); + } + test.AddInput("g_idx", {static_cast(K_pad)}, g_idx, true); + } test.AddOutput("Y", {M, N}, expected_vals); if (accuracy_level == 4) { test.SetOutputAbsErr("Y", 0.1f); @@ -158,6 +214,8 @@ TEST(MatMulNBits, Float32) { for (auto accuracy_level : 
{0}) { RunTest(M, N, K, block_size, accuracy_level, false, false); RunTest(M, N, K, block_size, accuracy_level, true, false); + RunTest(M, N, K, block_size, accuracy_level, false, false, true); + RunTest(M, N, K, block_size, accuracy_level, true, false, false, false); } #endif } @@ -172,8 +230,10 @@ TEST(MatMulNBits, Float16) { for (auto N : {1, 2, 32, 288}) { for (auto K : {16, 32, 64, 128, 256, 1024, 93, 1234}) { for (auto block_size : {16, 32, 64, 128}) { - RunTest(M, N, K, block_size, 0, false, true); - RunTest(M, N, K, block_size, 0, true, true); + for (auto has_gidx : {true, false}) { + RunTest(M, N, K, block_size, 0, false, true, has_gidx); + RunTest(M, N, K, block_size, 0, true, true, has_gidx, false); + } } } } @@ -183,9 +243,9 @@ TEST(MatMulNBits, Float16) { TEST(MatMulNBits, Float16Large) { for (auto block_size : {16, 32, 64, 128}) { for (auto symmetric : {false, true}) { - RunTest(1, 4096, 4096, block_size, 0, symmetric, true, 0.05f); - RunTest(1, 4096, 11008, block_size, 0, symmetric, true, 0.05f); - RunTest(1, 11008, 4096, block_size, 0, symmetric, true, 0.05f); + RunTest(1, 4096, 4096, block_size, 0, symmetric, true, false, true, 0.05f); + RunTest(1, 4096, 11008, block_size, 0, symmetric, true, false, true, 0.05f); + RunTest(1, 11008, 4096, block_size, 0, symmetric, true, false, true, 0.05f); } } } diff --git a/onnxruntime/test/python/quantization/op_test_utils.py b/onnxruntime/test/python/quantization/op_test_utils.py index c1bbb49f10c7e..b30282f2ab41f 100644 --- a/onnxruntime/test/python/quantization/op_test_utils.py +++ b/onnxruntime/test/python/quantization/op_test_utils.py @@ -358,6 +358,7 @@ def check_model_correctness( model_onnx = onnx.load(f) ops_set = set(node.op_type for node in model_onnx.graph.node) check_reference_evaluator = not (ops_set & {"EmbedLayerNormalization", "Conv", "Attention", "Transpose"}) + check_target_evaluator = False with open(model_path_to_check, "rb") as f: model_check = onnx.load(f) @@ -413,7 +414,7 @@ def check_model_correctness( check_sign_f8_quantization(model_path_origin, model_path_to_check) # Verifies the expected outputs. 
- if check_reference_evaluator and onnx_recent_enough: + if check_target_evaluator and onnx_recent_enough: if op_matmul: reference_new_ops = [QLinearMatMul] else: diff --git a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py index 73dae08af8ece..88e5052db4e2e 100644 --- a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py +++ b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py @@ -125,7 +125,10 @@ def quant_test( from onnxruntime.quantization import matmul_4bits_quantizer model = quant_utils.load_model_with_shape_infer(Path(model_fp32_path)) - quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(model, block_size, is_symmetric) + quant_config = matmul_4bits_quantizer.DefaultWeightOnlyQuantConfig( + block_size=block_size, is_symmetric=is_symmetric + ) + quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(model, algo_config=quant_config) quant.process() quant.model.save_model_to_file(model_int4_path, False) @@ -165,6 +168,9 @@ def quant_test_with_algo( elif algorithm == "GPTQ": # test GPTQ algorithm algo_config = matmul_4bits_quantizer.GPTQWeightOnlyQuantConfig(calibration_data_reader=data_reader) + elif algorithm == "HQQ": + # test HQQ algorithm + algo_config = matmul_4bits_quantizer.HQQWeightOnlyQuantConfig(block_size=block_size) model = quant_utils.load_model_with_shape_infer(Path(model_fp32_path)) quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(model, block_size, is_symmetric, algo_config=algo_config) @@ -227,6 +233,17 @@ def test_quantize_matmul_int4_using_gptq_algo(self): data_reader = self.input_feeds(1, {"input": [100, 52]}) self.quant_test_with_algo("GPTQ", model_fp32_path, data_reader, 32, False) + @unittest.skipIf( + find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits" + ) + def test_quantize_matmul_int4_using_hqq_algo(self): + if not find_spec("torch"): + self.skipTest("skip test_hqq_quant since torch is not installed") + model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_offset.onnx").absolute()) + self.construct_model_matmul(model_fp32_path, symmetric=False) + data_reader = self.input_feeds(1, {"input": [100, 52]}) + self.quant_test_with_algo("HQQ", model_fp32_path, data_reader, 32, False) + if __name__ == "__main__": unittest.main() From cd56ea4a74ee41c040899d702667d2c86bee4ef0 Mon Sep 17 00:00:00 2001 From: guyang3532 <62738430+guyang3532@users.noreply.github.com> Date: Tue, 5 Mar 2024 13:15:30 +0800 Subject: [PATCH 107/279] enable embedding sparse optimization by default (#19714) --- docs/ORTModule_Training_Guidelines.md | 2 +- .../training/ortmodule/_graph_execution_manager.py | 14 +++++++++----- .../python/training/ortmodule/options.py | 2 +- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/docs/ORTModule_Training_Guidelines.md b/docs/ORTModule_Training_Guidelines.md index f50b18b736936..84631bd1f6555 100644 --- a/docs/ORTModule_Training_Guidelines.md +++ b/docs/ORTModule_Training_Guidelines.md @@ -246,7 +246,7 @@ to standard outputs. #### ORTMODULE_ENABLE_EMBEDDING_SPARSE_OPTIMIZER - **Feature Area**: *ORTMODULE/Optimizations* -- **Description**: By default, this is disabled. This env var can be used for enabling or disabling the embedding input +- **Description**: By default, this is enabled. This env var can be used for enabling or disabling the embedding input data sparsity based performance optimizations. 
```bash diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py index fda6e345da235..e189ffff9cc7f 100755 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py @@ -681,11 +681,15 @@ def _enable_conditional_optimizations( ) if self._runtime_options.enable_embedding_sparse_optimizer and len(embed_sparsity_results) > 0: - graph_transformer_config.sparse_embedding_input_names = list(embed_sparsity_results.keys()) - self._logger.info("Embedding sparsity-based optimization is ON for %s", embed_sparsity_results) - self._runtime_options.embed_sparsity_ratio = ",".join( - [f"{k}:{v:.0f}%" for k, v in embed_sparsity_results.items()] - ) + if detected_device.type == "cuda": + # Embedding sparsity optimization is only supported on CUDA devices. + graph_transformer_config.sparse_embedding_input_names = list(embed_sparsity_results.keys()) + self._logger.info("Embedding sparsity-based optimization is ON for %s", embed_sparsity_results) + self._runtime_options.embed_sparsity_ratio = ",".join( + [f"{k}:{v:.0f}%" for k, v in embed_sparsity_results.items()] + ) + else: + self._logger.info("Embedding sparsity-based optimization is not supported on non-CUDA devices.") # If users don't want to print input density, disable the input density observer to avoid overhead # when looping through inputs during training. diff --git a/orttraining/orttraining/python/training/ortmodule/options.py b/orttraining/orttraining/python/training/ortmodule/options.py index 539859a0d58a6..93d24a34df6bd 100644 --- a/orttraining/orttraining/python/training/ortmodule/options.py +++ b/orttraining/orttraining/python/training/ortmodule/options.py @@ -271,7 +271,7 @@ def __init__(self, logger: Logger): self.enable_sparse_optimizer = True self.label_sparsity_ratio = "" self.embed_sparsity_ratio = "" - self.enable_embedding_sparse_optimizer = False # TODO(pengwa): remove once validation on more models are done. + self.enable_embedding_sparse_optimizer = True # Configuration for memory optimization. self.memory_optimization_level = ( From bdf678df93cb257e311de3fa82fe6409be2854ff Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Tue, 5 Mar 2024 17:09:42 +0100 Subject: [PATCH 108/279] Fix CUDA BatchNorm bugs and add support for NHWC (#19742) ### Description - Fix incorrect running_mean / running_var in training mode due to incorrect momentum and missing input mean/var. runnig_var could be correct, but has a too high epsilon. 
- Fix incorrect checks when using NHWC - Pass NHWC flag to NormalizeDims to get correct new dimensions from x_shape - Register missing double operations to get parity between NHWC/NCHW --- .../core/providers/cpu/nn/batch_norm_helper.h | 41 +++++++++++++------ .../providers/cuda/cuda_execution_provider.cc | 18 +++++--- .../core/providers/cuda/cuda_nhwc_kernels.cc | 16 ++++++++ .../core/providers/cuda/nn/batch_norm.cc | 11 ++++- .../providers/cpu/nn/batch_norm_op_test.cc | 1 + 5 files changed, 66 insertions(+), 21 deletions(-) diff --git a/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h b/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h index a5d46aff83b50..ccecbabfa3db3 100644 --- a/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h +++ b/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h @@ -25,6 +25,8 @@ class BatchNormHelper { const Tensor* var, bool is_spatial = true, bool is_nhwc = false) { + // NHWC dependent shape: X + // All other shapes are assumed to be in NCHW layout? const auto& x_dims = X->Shape().GetDims(); // If x_dims size < 2, num_channels defaults to 1. @@ -48,16 +50,22 @@ class BatchNormHelper { // validate 'scales' shape const auto& scale_dims = scale->Shape().GetDims(); if (static_cast(scale_dims.size()) != kNumInputScaleDimensions) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input scale: NumDimensions() != ", kNumInputScaleDimensions); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Invalid input scale: NumDimensions() != ", kNumInputScaleDimensions); } if (scale_dims[0] != num_channels) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input scale: 0th dimension != ", num_channels); } + // N & C do not belong to features + // skip the first element for NHWC and the first two elements for NCHW. + int feature_offset = is_nhwc ? 
1 : 2; + // in non-spatial cases - the other dims of 'scale' must be validated if (!is_spatial) { for (int feature = 0; feature < num_feature_dims; ++feature) { - if (scale_dims[1 + feature] != x_dims[2 + feature]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input scale: ", (1 + feature), " dimension != ", x_dims[2 + feature]); + if (scale_dims[1 + feature] != x_dims[feature_offset + feature]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input scale: ", (1 + feature), + " dimension != ", x_dims[feature_offset + feature]); } } } @@ -65,7 +73,8 @@ class BatchNormHelper { // validate 'B' shape const auto& B_dims = B->Shape().GetDims(); if (static_cast(B_dims.size()) != kNumInputBiasDimensions) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input B: NumDimensions() != ", kNumInputBiasDimensions); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Invalid input B: NumDimensions() != ", kNumInputBiasDimensions); } if (B_dims[0] != num_channels) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input B: 0th dimension != ", num_channels); @@ -73,8 +82,9 @@ class BatchNormHelper { // in non-spatial cases - the other dims of 'B' must be validated if (!is_spatial) { for (int feature = 0; feature < num_feature_dims; ++feature) { - if (B_dims[1 + feature] != x_dims[2 + feature]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input B: ", (1 + feature), " dimension != ", x_dims[2 + feature]); + if (B_dims[1 + feature] != x_dims[feature_offset + feature]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input B: ", (1 + feature), + " dimension != ", x_dims[feature_offset + feature]); } } } @@ -82,16 +92,19 @@ class BatchNormHelper { // validate 'mean' shape const auto& mean_dims = mean->Shape().GetDims(); if (static_cast(mean_dims.size()) != kNumInputMeanDimensions) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input mean: NumDimensions() != ", kNumInputMeanDimensions); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Invalid input mean: NumDimensions() != ", kNumInputMeanDimensions); } if (mean_dims[0] != num_channels) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input mean: 0th dimension != ", num_channels); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Invalid input mean: 0th dimension != ", num_channels); } // in non-spatial cases - the other dims of 'mean' must be validated if (!is_spatial) { for (int feature = 0; feature < num_feature_dims; ++feature) { - if (mean_dims[1 + feature] != x_dims[2 + feature]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input mean: ", (1 + feature), " dimension != ", x_dims[2 + feature]); + if (mean_dims[1 + feature] != x_dims[feature_offset + feature]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input mean: ", (1 + feature), + " dimension != ", x_dims[feature_offset + feature]); } } } @@ -99,7 +112,8 @@ class BatchNormHelper { // validate 'var' shape const auto& var_dims = var->Shape().GetDims(); if (static_cast(var_dims.size()) != kNumInputVarianceDimensions) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input var: NumDimensions() != ", kNumInputVarianceDimensions); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Invalid input var: NumDimensions() != ", kNumInputVarianceDimensions); } if (var_dims[0] != num_channels) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input var: 0th 
dimension != ", num_channels); @@ -107,8 +121,9 @@ class BatchNormHelper { // in non-spatial cases - the other dims of 'var' must be validated if (!is_spatial) { for (int feature = 0; feature < num_feature_dims; ++feature) { - if (var_dims[1 + feature] != x_dims[2 + feature]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input var: ", (1 + feature), " dimension != ", x_dims[2 + feature]); + if (var_dims[1 + feature] != x_dims[feature_offset + feature]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input var: ", (1 + feature), + " dimension != ", x_dims[feature_offset + feature]); } } } diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 1ce089fd93044..8ba282031a5d4 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -1202,9 +1202,12 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, float, LSTM); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, double, LSTM); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, MLFloat16, LSTM); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 14, float, BatchNormalization); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 14, double, BatchNormalization); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 14, MLFloat16, BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME( + kCudaExecutionProvider, kOnnxDomain, 14, 14, float, BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME( + kCudaExecutionProvider, kOnnxDomain, 14, 14, double, BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME( + kCudaExecutionProvider, kOnnxDomain, 14, 14, MLFloat16, BatchNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, float, ReduceMin); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, double, ReduceMin); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, MLFloat16, ReduceMin); @@ -2107,9 +2110,12 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc index f416caecd115f..64edc319e15ac 100644 --- a/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc +++ b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc @@ -18,10 +18,14 @@ namespace onnxruntime::cuda { class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 8, float, BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 8, double, + BatchNormalization); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 8, MLFloat16, 
BatchNormalization); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 9, 13, float, BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 9, 13, double, + BatchNormalization); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 9, 13, MLFloat16, BatchNormalization); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, float, @@ -72,10 +76,14 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalN class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 12, MLFloat16, MaxPool); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 14, 14, float, BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 14, 14, double, + BatchNormalization); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 14, 14, MLFloat16, BatchNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 15, float, BatchNormalization); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 15, double, + BatchNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 15, MLFloat16, BatchNormalization); @@ -86,18 +94,26 @@ Status RegisterCudaNhwcKernels(KernelRegistry& kernel_registry) { kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 8, MLFloat16, BatchNormalization)>, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo::ComputeInternal(OpKernelContext* p_op_kernel_context) CudnnTensor data_desc; vector new_dims; - BatchNormHelper::NormalizeDims(x_shape, new_dims); + BatchNormHelper::NormalizeDims(x_shape, new_dims, NHWC); ORT_RETURN_IF_ERROR(data_desc.Set(new_dims, CudnnTensor::GetDataType(), NHWC)); // For half data type, the alpha, beta, scale, B, mean, var need to be float type @@ -137,6 +137,12 @@ Status BatchNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) auto saved_mean_data = reinterpret_cast(saved_mean->MutableData()); auto saved_inv_var_data = reinterpret_cast(saved_var->MutableData()); + auto stream = static_cast(p_op_kernel_context->GetComputeStream()->GetHandle()); + CUDA_RETURN_IF_ERROR( + cudaMemcpyAsync(running_mean_data, mean_data, mean->SizeInBytes(), cudaMemcpyDeviceToDevice, stream)); + CUDA_RETURN_IF_ERROR( + cudaMemcpyAsync(running_var_data, var_data, var->SizeInBytes(), cudaMemcpyDeviceToDevice, stream)); + CUDNN_RETURN_IF_ERROR(BatchNormalizationForwardTrainingHelper( GetCudnnHandle(p_op_kernel_context), cudnn_batch_norm_mode_, @@ -149,7 +155,7 @@ Status BatchNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) bn_tensor_desc, scale_data, b_data, - momentum_, + 1.0 - momentum_, running_mean_data, running_var_data, epsilon_, @@ -186,6 +192,7 @@ SPECIALIZED_COMPUTE(MLFloat16, kOnnxDomain, false) #ifdef ENABLE_CUDA_NHWC_OPS SPECIALIZED_COMPUTE(float, kMSInternalNHWCDomain, true) +SPECIALIZED_COMPUTE(double, kMSInternalNHWCDomain, true) 
SPECIALIZED_COMPUTE(MLFloat16, kMSInternalNHWCDomain, true) #endif } // namespace cuda diff --git a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc index cbb4531a50b7c..54e5c71bd753a 100644 --- a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc @@ -916,6 +916,7 @@ TEST(BatchNormTest, ForwardTrainingTestWithSavedOutputsOpset9) { // exclude CUDA Execution Provider due to flakiness // exclude TRT and OpenVINO for same reasons as seen in TestBatchNorm() test.Run(OpTester::ExpectResult::kExpectSuccess, "", + // TODO(mtavenrath) flakiness of running_mean for CUDA has been fixed, the delta of running_var is still ~0.1 {kCudaExecutionProvider, kRocmExecutionProvider, kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider}); } From 06e684c9f2f8495de5259967cc12bab24da3d522 Mon Sep 17 00:00:00 2001 From: Chen Fu <1316708+chenfucn@users.noreply.github.com> Date: Tue, 5 Mar 2024 09:37:45 -0800 Subject: [PATCH 109/279] Adding cuda kernel (optimized for sm80) for block-wise 4b quantized float 16 GEMM. (#18619) ### Description Adding CUDA kernel for block-wise 4b quantized float 16 GEMM, this is specially optimized for Nvidia Ampere GPUs. ### Motivation and Context Trying to improve quantized LLM inference performance on Nvidia Ampere GPUs ### Note: This is implemented by extending CUTLASS, so it has a hard dependency on CUTLASS. However, in current build system, loading of CUTLASS dependency is guarded with: (onnxruntime_USE_FLASH_ATTENTION OR onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION) If both of these options are turned off, then compilation will fail. Why CUTLASS dependency is guarded at all? It's a header file only library that does not introduce any binary if not instantiated. What's the downside of removing all the guards and just include CUTLASS unconditionally? 
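For illustration only, a rough sketch (not part of this change) of what an unguarded CUTLASS setup could look like in `cmake/onnxruntime_providers_cuda.cmake`, reusing the existing `include(cutlass)` module, the `${target}` variable, and `cutlass_SOURCE_DIR` paths already referenced in this patch; whether the include paths should stay behind the attention flags is exactly the open question above:

```cmake
# Hypothetical: pull in the header-only CUTLASS dependency whenever the CUDA EP
# is built, instead of gating it on onnxruntime_USE_FLASH_ATTENTION /
# onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION. CUTLASS contributes no binary
# unless its templates are instantiated, so this only adds include paths.
include(cutlass)
target_include_directories(${target} PRIVATE
  ${cutlass_SOURCE_DIR}/include
  ${cutlass_SOURCE_DIR}/examples
  ${cutlass_SOURCE_DIR}/tools/util/include)
```

Since CUTLASS is header-only, the unguarded form should not affect binary size; any cost would be limited to configure/compile time.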
--- .lintrunner.toml | 1 + cmake/CMakeLists.txt | 5 +- cmake/onnxruntime_providers_cuda.cmake | 2 +- cmake/onnxruntime_unittests.cmake | 1 + onnxruntime/core/mickey/README.md | 4 + .../core/mickey/blk_q4/f16_gemm_sm80.h | 208 +++ .../{prepack_sm80.h => f16_prepack_sm80.h} | 2 +- .../cutlass_ext/q4gemm/device/quantb_gemm.h | 481 ++++++ .../q4gemm/kernel/default_quantb_gemm.h | 255 ++++ .../cutlass_ext/q4gemm/kernel/quantb_gemm.h | 462 ++++++ .../q4gemm/threadblock/default_quantb_mma.h | 248 ++++ .../threadblock/default_quantb_mma_core.h | 340 +++++ .../optional_predicated_tile_access_iter.h | 314 ++++ .../optional_regular_tile_access_iter.h | 224 +++ .../threadblock/quantb_mma_multistage.h | 1290 +++++++++++++++++ .../warp/default_quantb_mma_tensor_op.h | 112 ++ .../quantb_meta_mma_tensor_op_tile_iterator.h | 883 +++++++++++ .../q4gemm/warp/quantb_mma_tensor_op.h | 361 +++++ onnxruntime/core/util/matrix_layout.h | 1 - .../test/cuda_host/blkq4_fp16_quant_sm80.h | 203 +++ .../cuda/test_cases/blkq4_fp16_gemm_sm80.h | 188 +++ .../test_cases/blkq4_fp16_gemm_sm80_test.cc | 330 +++++ .../test_cases/blkq4_fp16_gemm_sm80_testcu.cu | 344 +++++ .../blkq4_fp16_sm80_prepack_test.cc | 507 ------- .../cuda_execution_provider_test.cc | 4 +- 25 files changed, 6257 insertions(+), 513 deletions(-) create mode 100644 onnxruntime/core/mickey/blk_q4/f16_gemm_sm80.h rename onnxruntime/core/mickey/blk_q4/{prepack_sm80.h => f16_prepack_sm80.h} (99%) create mode 100644 onnxruntime/core/mickey/cutlass_ext/q4gemm/device/quantb_gemm.h create mode 100644 onnxruntime/core/mickey/cutlass_ext/q4gemm/kernel/default_quantb_gemm.h create mode 100644 onnxruntime/core/mickey/cutlass_ext/q4gemm/kernel/quantb_gemm.h create mode 100644 onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/default_quantb_mma.h create mode 100644 onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/default_quantb_mma_core.h create mode 100644 onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/optional_predicated_tile_access_iter.h create mode 100644 onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/optional_regular_tile_access_iter.h create mode 100644 onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h create mode 100644 onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/default_quantb_mma_tensor_op.h create mode 100644 onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h create mode 100644 onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_mma_tensor_op.h create mode 100644 onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h create mode 100644 onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h create mode 100644 onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc create mode 100644 onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu delete mode 100644 onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_sm80_prepack_test.cc diff --git a/.lintrunner.toml b/.lintrunner.toml index 4e5d077b08ff4..be95e03479cf9 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -132,6 +132,7 @@ exclude_patterns = [ 'onnxruntime/core/flatbuffers/schema/*.fbs.h', # Generated code 'onnxruntime/core/graph/contrib_ops/quantization_defs.cc', 'onnxruntime/core/mlas/**', # Contains assembly code + 'onnxruntime/core/mickey/cutlass_ext/**', # CUTLASS lib recommends NO automatic code formatting 'winml/lib/Api.Image/shaders/**', # Contains data chunks ] command = [ diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 
8453da19ce3a6..0d55d4cab9826 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -727,6 +727,9 @@ if (onnxruntime_USE_CUDA) set(onnxruntime_USE_FLASH_ATTENTION OFF) set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF) endif() + if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.4) + message( FATAL_ERROR "Failed build due to CUDA compiler version < 11.4") + endif() else() set(onnxruntime_USE_FLASH_ATTENTION OFF) set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF) @@ -747,8 +750,8 @@ if (onnxruntime_USE_CUDA) list(APPEND ORT_PROVIDER_FLAGS -DUSE_MEMORY_EFFICIENT_ATTENTION=1) list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_MEMORY_EFFICIENT_ATTENTION=1) endif() - endif() + if (onnxruntime_USE_VITISAI) list(APPEND ORT_PROVIDER_FLAGS -DUSE_VITISAI=1) list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_VITISAI=1) diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 0f6d48bdb6ec8..7f295a59a0931 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -201,7 +201,7 @@ endif() include(cutlass) - target_include_directories(${target} PRIVATE ${cutlass_SOURCE_DIR}/include ${cutlass_SOURCE_DIR}/examples) + target_include_directories(${target} PRIVATE ${cutlass_SOURCE_DIR}/include ${cutlass_SOURCE_DIR}/examples ${cutlass_SOURCE_DIR}/tools/util/include) target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 88f662075e177..b004054c616a5 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -774,6 +774,7 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS) onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_ut ${onnxruntime_test_providers_cuda_ut_src} $) config_cuda_provider_shared_module(onnxruntime_providers_cuda_ut) onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock) + target_include_directories(onnxruntime_providers_cuda_ut PRIVATE ${ONNXRUNTIME_ROOT}/core/mickey) target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common) list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_cuda_ut) endif() diff --git a/onnxruntime/core/mickey/README.md b/onnxruntime/core/mickey/README.md index 7e8d30cd1805b..735ec4b80daf3 100644 --- a/onnxruntime/core/mickey/README.md +++ b/onnxruntime/core/mickey/README.md @@ -4,3 +4,7 @@ Playful name for a template library of high performance cuda code that are often shared by various AI operators. The intention is to make this header files only, with no binary impact unless it is instantiated where it is needed. + +Currently cuda code are scattered in multiple locations in the repo. +Hopefully this can be the starting point of consolidating all cuda +code. diff --git a/onnxruntime/core/mickey/blk_q4/f16_gemm_sm80.h b/onnxruntime/core/mickey/blk_q4/f16_gemm_sm80.h new file mode 100644 index 0000000000000..52bff7e40dbe3 --- /dev/null +++ b/onnxruntime/core/mickey/blk_q4/f16_gemm_sm80.h @@ -0,0 +1,208 @@ +/** + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. + * + * Module Name: + * blk_q4/f16_gemm_sm80.h + * + * Abstract: + * Entry point for Q4F16 GEMM kernel for SM80 devices. 
+ */ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass_ext/q4gemm/device/quantb_gemm.h" + +namespace onnxruntime { +namespace cuda { + +// +// This is the implementation of the quantized GEMM kernel for 16b float x blocked quantized 4b data type +// +template < + typename ElementDequant_, // <- data type of dequantized elements for gemm, fp16 or bf16 + typename QuantBlocking_, // <- weights block per scale, cutlass::MatrixShape + bool SmallM, // <- true if M <= 16 + bool kHasQuantOffset> +struct BlkQ4F16GemmImpl { + // + // Type definitions + // + + using ElementDequant = ElementDequant_; + using QuantBlocking = QuantBlocking_; + + static_assert(sizeof(ElementDequant) == 2, "q4f16gemm kerenl only support 16b operands!"); + + // Data types that are fixed for this kernel + using ElementAccumulator = float; + using ElementComputeEpilogue = ElementAccumulator; + using ElementInputA = ElementDequant; + using ElementOutput = ElementDequant; + + using ElementW = uint8_t; // <- Weight is int4, uint8 for two of them + + // We pack 4 weights into one 16b element, so as to leverage cutlass tile iterators + // for async shared memory loading and minimize bank conflict + using ElementWPack = ElementDequant; + + using ElementQScale = ElementDequant; // <- data type of quantization scale + using ElementQOffset = uint8_t; + + using LayoutInputA = cutlass::layout::RowMajor; + using LayoutInputWPack = cutlass::layout::ColumnMajor; + using LayoutOutput = cutlass::layout::RowMajor; + + // Layout of quantization scale and offset, oriented to be loaded using less instructions + // in a warp tile + using LayoutInputQScale = + typename std::conditional::type; // <- layout of quantization scale + + using ShapeMMAThreadBlock = + typename std::conditional, + cutlass::gemm::GemmShape<128, 256, 64>>::type; + + static constexpr int MinN = QuantBlocking::kColumn > 32 ? QuantBlocking::kColumn : 32; + using ShapeMMAWarp = + typename std::conditional, + cutlass::gemm::GemmShape<64, 64, 64>>::type; + + using ShapeMMAOp = cutlass::gemm::GemmShape<16, 8, 16>; + + // This code section describes how threadblocks are scheduled on GPU + using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? + + // This code section describes the epilogue part of the kernel + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, // <- data type of output matrix + 128 / cutlass::sizeof_bits::value, // <- the number of elements per vectorized + // memory access. For a byte, it's 16 + // elements. 
This becomes the vector width of + // math instructions in the epilogue too + ElementAccumulator, // <- data type of accumulator + ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function + + // Number of pipelines you want to use + static constexpr int NumStages = 3; + + using Gemm = cutlass::gemm::device::QuantBGemm< + ElementInputA, + LayoutInputA, + ElementWPack, + LayoutInputWPack, + ElementQScale, + typename std::conditional::type, + LayoutInputQScale, + QuantBlocking, + ElementOutput, + LayoutOutput, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ShapeMMAThreadBlock, + ShapeMMAWarp, + ShapeMMAOp, + EpilogueOp, + SwizzleThreadBlock, + NumStages>; + + using Arguments = typename Gemm::Arguments; + + // Invoke gemm kernel (the version with quantization offset) + static cutlass::Status run( + cudaStream_t stream, + const cutlass::gemm::GemmCoord& problem_size_, + cutlass::TensorRef ref_A_, + cutlass::TensorRef ref_B_, + cutlass::TensorRef ref_Qscale_, + cutlass::TensorRef ref_Qoffset_, + cutlass::TensorRef ref_C_, + cutlass::TensorRef ref_D_, + typename EpilogueOp::Params epilogue_ = typename EpilogueOp::Params()) { + if constexpr (!kHasQuantOffset) { + return cutlass::Status::kErrorNotSupported; + } else { + if constexpr (ShapeMMAThreadBlock::kM == 16) { + if (problem_size_.m() > 16) { + // For M > 16, the caller should have picked the + // kernel with bigger M + return cutlass::Status::kErrorNotSupported; + } + } + + // Construct Gemm arguments + Arguments args{ + problem_size_, + ref_A_, + ref_B_, + ref_Qscale_, + ref_Qoffset_, + ref_C_, + ref_D_, + epilogue_}; + + Gemm gemm_op; + + // Check if this GEMM can be run or not + cutlass::Status status = gemm_op.can_implement(args); + if (status != cutlass::Status::kSuccess) { + return status; + } + + // Launch the CUTLASS GEMM kernel. + return gemm_op(args, nullptr, stream); + } + } + + // Invoke gemm kernel (the version without quantization offset) + static cutlass::Status run( + cudaStream_t stream, + const cutlass::gemm::GemmCoord& problem_size_, + cutlass::TensorRef ref_A_, + cutlass::TensorRef ref_B_, + cutlass::TensorRef ref_Qscale_, + cutlass::TensorRef ref_C_, + cutlass::TensorRef ref_D_, + typename EpilogueOp::Params epilogue_ = typename EpilogueOp::Params()) { + if constexpr (kHasQuantOffset) { + return cutlass::Status::kErrorNotSupported; + } else { + if constexpr (ShapeMMAThreadBlock::kM == 16) { + if (problem_size_.m() > 16) { + // For M > 16, the caller should have picked the + // kernel with bigger M + return cutlass::Status::kErrorNotSupported; + } + } + + // Construct Gemm arguments + Arguments args{ + problem_size_, + ref_A_, + ref_B_, + ref_Qscale_, + ref_C_, + ref_D_, + epilogue_}; + + Gemm gemm_op; + + // Check if this GEMM can be run or not + cutlass::Status status = gemm_op.can_implement(args); + if (status != cutlass::Status::kSuccess) { + return status; + } + + // Launch the CUTLASS GEMM kernel. + return gemm_op(args, nullptr, stream); + } + } +}; + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/mickey/blk_q4/prepack_sm80.h b/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h similarity index 99% rename from onnxruntime/core/mickey/blk_q4/prepack_sm80.h rename to onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h index e291ab39e8aa3..a08cfb97eed4a 100644 --- a/onnxruntime/core/mickey/blk_q4/prepack_sm80.h +++ b/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h @@ -3,7 +3,7 @@ * Licensed under the MIT License. 
* * Module Name: - * prepack_sm80.h + * blk_q4/f16_prepack_sm80.h * * Abstract: * Prepack weights and quantization parameters (scales and offsets) for diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/device/quantb_gemm.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/device/quantb_gemm.h new file mode 100644 index 0000000000000..38795291b0328 --- /dev/null +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/device/quantb_gemm.h @@ -0,0 +1,481 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** + * Modifications Copyright (c) Microsoft. + * Licensed under the MIT license. + * + * @file quantb_gemm.h + * @brief Modified from cutlass/gemm/device/gemm.h, boilerplate code passing input pointers to the kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/arch.h" +#include "cutlass/device_kernel.h" + +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" +#include "cutlass/gemm/kernel/gemm.h" + +#include "cutlass_ext/q4gemm/kernel/default_quantb_gemm.h" +#include "cutlass/gemm/device/default_gemm_configuration.h" + +#include "cutlass/layout/permute.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/*! A specialized GEMM operator for quantized B GEMM. + + It is modified from cutlass::gemm::device::Gemm. Both this class and the original Gemm class + are pretty much boilerplate code that construct the Gemm kernel class, and pass parameters + and controls to it. 
The only difference is that this class has a few more template parameters + to support quantization. + + This implementation pretty much follows the design of cutlass. But this class seems to be + just a wrapper of the Gemm kernel class. Consider combining them in future iterations. + +*/ +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Element type for quant scales + typename ElementQScale_, + /// Element type for quant offsets + typename ElementQOffset_, + /// Layout type for quant scales and offsets + typename LayoutQMeta_, + /// Blocking dimensions for quantization + typename QuantBlocking_, + /// Element type for C and D matrix operands + typename ElementC_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Element type for internal accumulation + typename ElementAccumulator_ = ElementC_, + /// Operator class tag + typename OperatorClass_ = arch::OpClassSimt, + /// Tag indicating architecture to tune for + typename ArchTag_ = arch::Sm80, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle_ = + typename threadblock::GemmIdentityThreadblockSwizzle<>, + /// Number of stages used in the pipelined mainloop + int Stages = + DefaultGemmConfiguration::kStages, + /// Access granularity of A matrix in units of elements + int AlignmentA = + DefaultGemmConfiguration::kAlignmentA, + /// Access granularity of B matrix in units of elements + int AlignmentB = + DefaultGemmConfiguration::kAlignmentB, + /// If true, kernel supports split-K with serial reduction + bool SplitKSerial = false, + /// Operation performed by GEMM + typename Operator_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::Operator, + /// Gather operand A by using an index array + bool GatherA = false, + /// Gather operand B by using an index array + bool GatherB = false, + /// Scatter result D by using an index array + bool ScatterD = false, + /// Permute result D + typename PermuteDLayout = layout::NoPermute> +class QuantBGemm { + public: + + using ElementA = ElementA_; + using LayoutA = LayoutA_; + using TensorRefA = TensorRef; + using ElementB = ElementB_; + using LayoutB = LayoutB_; + using TensorRefB = TensorRef; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + using TensorRefC = TensorRef; + using TensorRefD = TensorRef; + using ElementAccumulator = ElementAccumulator_; + using OperatorClass = OperatorClass_; + using ArchTag = ArchTag_; + using 
ThreadblockShape = ThreadblockShape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using EpilogueOutputOp = EpilogueOutputOp_; + using ThreadblockSwizzle = ThreadblockSwizzle_; + using Operator = Operator_; + static int const kStages = Stages; + static int const kAlignmentA = AlignmentA; + static int const kAlignmentB = AlignmentB; + static int const kAlignmentC = EpilogueOutputOp::kCount; + static bool const kSplitKSerial = SplitKSerial; + static ComplexTransform const kTransformA = ComplexTransform::kNone; + static ComplexTransform const kTransformB = ComplexTransform::kNone; + + // Quantization Parameters + static_assert(std::is_same::value, + "LayoutB, i.e. packed weights must appear ColumnMajor."); + static_assert(InstructionShape::kK == 16, + "InstructionShape::kK must be a multiple of 16 (2 tiles), required by 4b weight packing layout."); + using ElementQScale = ElementQScale_; + using ElementQOffset = ElementQOffset_; + using LayoutQMeta = LayoutQMeta_; + using QuantBlocking = QuantBlocking_; + static constexpr bool kHasQOffset = !(std::is_same::value); + + // TODO(chenfucn): consider moving to uint4_t or smaller for QOffset + static_assert(!kHasQOffset || std::is_same::value, "QOffset must be uint8_t"); + + /// Define the kernel + using GemmKernel = typename kernel::DefaultQuantBGemm< + ElementA, + LayoutA, + kAlignmentA, + ElementB, + LayoutB, + kAlignmentB, + ElementQScale, + ElementQOffset, + LayoutQMeta, + QuantBlocking, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + kStages, + kSplitKSerial, + Operator, + GatherA, + GatherB, + ScatterD, + PermuteDLayout + >::GemmKernel; + + /// Argument structure + struct Arguments { + // + // Data members + // + + GemmCoord problem_size; + TensorRef ref_A; + TensorRef ref_B; + TensorRef ref_C; + TensorRef ref_D; + TensorRef ref_Qscale; + TensorRef ref_Qoffset; + + typename EpilogueOutputOp::Params epilogue; + + // split-K parallelism (etc.) are not yet supported, keeping this for future extension + int split_k_slices{1}; + // For gather+scatter operations + int const *gather_A_indices{nullptr}; + int const *gather_B_indices{nullptr}; + int const *scatter_D_indices{nullptr}; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments(): problem_size(0, 0, 0) {} + + /// Constructs an Arguments structure + CUTLASS_HOST_DEVICE + Arguments( + GemmCoord problem_size_, + TensorRef ref_A_, + TensorRef ref_B_, + TensorRef ref_Qscale_, + TensorRef ref_C_, + TensorRef ref_D_, + typename EpilogueOutputOp::Params epilogue_ = + typename EpilogueOutputOp::Params()): + problem_size(problem_size_), + ref_A(ref_A_), + ref_B(ref_B_), + ref_Qscale(ref_Qscale_), + ref_C(ref_C_), + ref_D(ref_D_), + epilogue(epilogue_) { + assert(!kHasQOffset); + } + + CUTLASS_HOST_DEVICE + Arguments( + GemmCoord problem_size_, + TensorRef ref_A_, + TensorRef ref_B_, + TensorRef ref_Qscale_, + TensorRef ref_Qoffset_, + TensorRef ref_C_, + TensorRef ref_D_, + typename EpilogueOutputOp::Params epilogue_ = + typename EpilogueOutputOp::Params()): + problem_size(problem_size_), + ref_A(ref_A_), + ref_B(ref_B_), + ref_Qscale(ref_Qscale_), + ref_Qoffset(ref_Qoffset_), + ref_C(ref_C_), + ref_D(ref_D_), + epilogue(epilogue_) { + assert(kHasQOffset); + } + }; + + private: + /// Kernel parameters object + typename GemmKernel::Params params_; + + public: + /// Constructs the GEMM. 
+ QuantBGemm() { } + + /// Determines whether the GEMM can execute the given problem. + static Status can_implement(Arguments const &args) { + + if (!kSplitKSerial && args.split_k_slices > 1) { + return Status::kErrorInvalidProblem; + } + + Status status = GemmKernel::can_implement( + args.problem_size, + args.ref_A.non_const_ref(), + args.ref_B.non_const_ref(), + args.ref_Qscale.non_const_ref(), + args.ref_Qoffset.non_const_ref(), + args.ref_C.non_const_ref(), + args.ref_D + ); + + if (status != Status::kSuccess) { + return status; + } + + return Status::kSuccess; + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const &args) { + + size_t bytes = 0; + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape( + args.problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.split_k_slices); + + if (kSplitKSerial && args.split_k_slices > 1) { + + bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n()); + } + + return bytes; + } + + /// Initializes GEMM state from arguments. + Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) { + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape( + args.problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.split_k_slices); + + if (kSplitKSerial) { + if (args.split_k_slices > 1) { + if (!workspace) { + return Status::kErrorWorkspaceNull; + } + + size_t bytes = get_workspace_size(args); + + cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } + } else { + + if (args.split_k_slices > 1) { + return Status::kErrorInvalidProblem; + } + } + + // Initialize the Params structure + params_ = typename GemmKernel::Params{ + args.problem_size, + grid_shape, + args.ref_A.non_const_ref(), + args.ref_B.non_const_ref(), + args.ref_Qscale.non_const_ref(), + args.ref_Qoffset.non_const_ref(), + args.ref_C.non_const_ref(), + args.ref_D, + args.epilogue, + static_cast(workspace), + args.gather_A_indices, + args.gather_B_indices, + args.scatter_D_indices + }; + + return Status::kSuccess; + } + + /// Lightweight update given a subset of arguments + Status update(Arguments const &args, void *workspace = nullptr) { + + if (kSplitKSerial && args.split_k_slices > 1) { + if (!workspace) { + return Status::kErrorWorkspaceNull; + } + } + + params_.ref_A.reset(args.ref_A.non_const_ref().data()); + params_.ref_B.reset(args.ref_B.non_const_ref().data()); + params_.ref_Qscale.reset(args.ref_Qscale.non_const_ref().data()); + params_.ref_Qoffset.reset(args.ref_Qoffset.non_const_ref().data()); + params_.ref_C.reset(args.ref_C.non_const_ref().data()); + params_.ref_D.reset(args.ref_D.data()); + params_.output_op = args.epilogue; + params_.semaphore = static_cast(workspace); + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. 
+ Status run(cudaStream_t stream = nullptr) { + + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); + dim3 block(GemmKernel::kThreadCount, 1, 1); + + cudaError_t result; + + int smem_size = int(sizeof(typename GemmKernel::SharedStorage)); + + if (smem_size >= (48 << 10)) { + result = cudaFuncSetAttribute(Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + + if (result != cudaSuccess) { + std::cerr << "Failed to obtain maximum shared memory size " << smem_size << " for kernel: " + << cudaGetErrorString(result) << "\n"; + return Status::kErrorInternal; + } + } + + cutlass::Kernel<<>>(params_); + + result = cudaGetLastError(); + + return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal; + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr) { + return run(stream); + } + + /// Runs the kernel using initialized state. + Status operator()( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + Status status = initialize(args, workspace, stream); + + if (status == Status::kSuccess) { + status = run(stream); + } + + return status; + } +}; + + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/kernel/default_quantb_gemm.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/kernel/default_quantb_gemm.h new file mode 100644 index 0000000000000..2f4460bb59e9f --- /dev/null +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/kernel/default_quantb_gemm.h @@ -0,0 +1,255 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +/** + * Modifications Copyright (c) Microsoft. + * Licensed under the MIT license. + * + * @file default_quantb_gemm.h + * @brief Modified from cutlass/gemm/kernel/default_gemm.h. templates for combining + * threadblock-scoped matrix multiply-add with the appropriate + * threadblock-scoped epilogue. + */ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/wmma.h" + +#include "cutlass/epilogue/threadblock/epilogue.h" +#include "cutlass/epilogue/thread/linear_combination.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass_ext/q4gemm/kernel/quantb_gemm.h" +#include "cutlass/gemm/kernel/gemm_pipelined.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm75.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm70.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" +#include "cutlass_ext/q4gemm/threadblock/default_quantb_mma.h" +#include "cutlass/gemm/threadblock/default_mma_core_simt.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h" +#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h" +#include "cutlass/epilogue/threadblock/default_epilogue_simt.h" +#include "cutlass/transform/threadblock/predicated_tile_iterator.h" + +#include "cutlass/layout/permute.h" + +#if defined(CUTLASS_ARCH_WMMA_ENABLED) +#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h" +#endif //CUTLASS_ARCH_WMMA_ENABLED + +//////////////////////////////////////////////////////////////////////////////// +namespace cutlass { +namespace gemm { +namespace kernel { + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for quant scales + typename ElementQScale_, + /// Element type for quant offsets + typename ElementQOffset_, + /// Layout type for quant scales and offsets + typename LayoutQMeta_, + /// Blocking dimensions for quantization + typename QuantBlocking_, + /// Access granularity of quant scales in units of elements + typename ElementC_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// If true, kernel is configured to support serial reduction in the + /// epilogue + bool SplitKSerial, + /// Operation performed by GEMM + typename Operator, + /// Gather operand A by using an index array + 
bool GatherA = false, + /// Gather operand B by using an index array + bool GatherB = false, + /// Scatter result D by using an index array + bool ScatterD = false, + /// Permute result D + typename PermuteDLayout = layout::NoPermute, + /// Permute operand A + typename PermuteALayout = layout::NoPermute, + /// Permute operand B + typename PermuteBLayout = layout::NoPermute, + /// + typename Enable = void +> +struct DefaultQuantBGemm; + +//////////////////////////////////////////////////////////////////////////////// + + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for Ampere Architecture +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of A matrix in units of elements + int kAlignmentB, + /// Element type for quant scales + typename ElementQScale, + /// Element type for quant offsets + typename ElementQOffset, + /// Layout type for quant scales + typename LayoutQMeta, + /// Blocking dimensions for quantization + typename QuantBlocking, + /// Access granularity of quant scales in units of elements + typename ElementC, + /// Layout type for C and D matrix operand + typename LayoutC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// If true, kernel is configured to support serial reduction in the + /// epilogue + bool SplitKSerial, + /// Operation performed by GEMM + typename Operator, + /// Gather operand A by using an index array + bool GatherA, + /// Gather operand B by using an index array + bool GatherB, + /// Scatter result D by using an index array + bool ScatterD, + /// Permute result D + typename PermuteDLayout, + /// Permute operand A + typename PermuteALayout, + /// Permute operand B + typename PermuteBLayout +> +struct DefaultQuantBGemm { + + static_assert((platform::is_same::value + || platform::is_same>::value), + "Epilogue in the kernel level must be row major"); + + /// Define the threadblock-scoped matrix multiply-accumulate + using Mma = typename cutlass::gemm::threadblock::DefaultQuantBMma< + ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, + ElementQScale, ElementQOffset, LayoutQMeta, QuantBlocking, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, arch::Sm80, + ThreadblockShape, WarpShape, InstructionShape, Stages, + Operator, false, GatherA, GatherB, + PermuteALayout, PermuteBLayout>::ThreadblockMma; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + /// Define the epilogue + using RegularEpilogue = + typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp, + EpilogueOutputOp::kCount, ScatterD, PermuteDLayout>::Epilogue; + + using Affine2Epilogue = + typename 
cutlass::epilogue::threadblock::DefaultEpilogueTensorOpAffineRankN< + 2, ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp, + EpilogueOutputOp::kCount>::Epilogue; + + using Epilogue = typename platform::conditional::value, + RegularEpilogue, + Affine2Epilogue>::type; + + /// Define the kernel-level GEMM operator. + using GemmKernel = kernel::QuantBGemm; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/kernel/quantb_gemm.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/kernel/quantb_gemm.h new file mode 100644 index 0000000000000..6e5ad8f406147 --- /dev/null +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/kernel/quantb_gemm.h @@ -0,0 +1,462 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** + * Modifications Copyright (c) Microsoft. + * Licensed under the MIT license. + * + * @file quantb_gemm.h + * @brief Modified from cutlass/gemm/kernel/gemm.h. + * Template for a pipelined GEMM kernel. Does not compute batching or support split-K. + */ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/semaphore.h" +#include "cutlass/arch/arch.h" + +#include "cutlass/util/debug.h" +#include "cutlass/util/device_dump.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Mma_, ///! 
Threadblock-scoped matrix multiply-accumulate + typename Epilogue_, ///! Epilogue + typename ThreadblockSwizzle_, ///! Threadblock swizzling function + bool SplitKSerial ///! If true, code supporting split-K via serial reduction is enabled. +> +struct QuantBGemm { + + using Mma = Mma_; + using Epilogue = Epilogue_; + using OutputOp = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + static bool const kSplitKSerial = SplitKSerial; + + static constexpr bool kHasQOffset = Mma::kHasQOffset; + + /// Warp count (concept: GemmShape) + using WarpCount = typename Mma::WarpCount; + static int const kThreadCount = 32 * WarpCount::kCount; + + /// Parameters structure + struct Params { + cutlass::gemm::GemmCoord problem_size; + cutlass::gemm::GemmCoord grid_tiled_shape; + int swizzle_log_tile; + typename Mma::IteratorA::Params params_A; + typename Mma::IteratorA::TensorRef ref_A; + typename Mma::IteratorB::Params params_B; + typename Mma::IteratorB::TensorRef ref_B; + typename Mma::IteratorQScale::Params params_QScale; + typename Mma::IteratorQScale::TensorRef ref_QScale; + typename Mma::IteratorQOffset::Params params_QOffset; + typename Mma::IteratorQOffset::TensorRef ref_QOffset; + typename Epilogue::OutputTileIterator::Params params_C; + typename Epilogue::OutputTileIterator::TensorRef ref_C; + typename Epilogue::OutputTileIterator::Params params_D; + typename Epilogue::OutputTileIterator::TensorRef ref_D; + typename OutputOp::Params output_op; + int *semaphore; + int gemm_k_size; // how many k vectors are processed by this threadblock + // For gather+scatter operations + int const *gather_A_indices; + int const *gather_B_indices; + int const *scatter_D_indices; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params(): swizzle_log_tile(0), semaphore(0), gemm_k_size(0) { } + + CUTLASS_HOST_DEVICE + Params( + cutlass::gemm::GemmCoord const & problem_size, + cutlass::gemm::GemmCoord const & grid_tiled_shape, + typename Mma::IteratorA::TensorRef ref_A, + typename Mma::IteratorB::TensorRef ref_B, + typename Mma::IteratorQScale::TensorRef ref_QScale, + typename Mma::IteratorQOffset::TensorRef ref_QOffset, + typename Epilogue::OutputTileIterator::TensorRef ref_C, + typename Epilogue::OutputTileIterator::TensorRef ref_D, + typename OutputOp::Params output_op = typename OutputOp::Params(), + int *workspace = nullptr, + int const *gather_A_indices = nullptr, + int const *gather_B_indices = nullptr, + int const *scatter_D_indices = nullptr + ): + problem_size(problem_size), + grid_tiled_shape(grid_tiled_shape), + swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)), + params_A(ref_A.layout()), + ref_A(ref_A), + params_B(ref_B.layout()), + ref_B(ref_B), + params_QScale(ref_QScale.layout()), + ref_QScale(ref_QScale), + params_QOffset(ref_QOffset.layout()), + ref_QOffset(ref_QOffset), + params_C(ref_C.layout()), + ref_C(ref_C), + params_D(ref_D.layout()), + ref_D(ref_D), + output_op(output_op), + gather_A_indices(gather_A_indices), + gather_B_indices(gather_B_indices), + scatter_D_indices(scatter_D_indices) { + int total_gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + int gemm_k_iterations = (total_gemm_k_iterations + grid_tiled_shape.k() - 1) / grid_tiled_shape.k(); + + gemm_k_size = gemm_k_iterations * Mma::Shape::kK; + + semaphore = workspace; + } + }; + + /// Shared memory storage structure + union SharedStorage { + typename Mma::SharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + }; + + // + // Methods + // 
+ + CUTLASS_HOST_DEVICE + QuantBGemm() { } + + /// Determines whether kernel satisfies alignment + CUTLASS_HOST_DEVICE + static Status can_implement( + cutlass::gemm::GemmCoord const & problem_size, + typename Mma::IteratorA::TensorRef ref_A, + typename Mma::IteratorB::TensorRef ref_B, + typename Mma::IteratorQScale::TensorRef ref_QScale, + typename Mma::IteratorQOffset::TensorRef ref_QOffset, + typename Epilogue::OutputTileIterator::TensorRef ref_C, + typename Epilogue::OutputTileIterator::TensorRef ref_D) { + + // TODO check problem_size K, N must be multiple of QuantBlocking + + static int const kAlignmentA = (platform::is_same>::value) + ? 32 + : (platform::is_same>::value) + ? 64 + : Mma::IteratorA::AccessType::kElements; + static int const kAlignmentB = (platform::is_same>::value) + ? 32 + : (platform::is_same>::value) + ? 64 + : Mma::IteratorB::AccessType::kElements; + static int const kAlignmentC = (platform::is_same>::value) + ? 32 + : (platform::is_same>::value) + ? 64 + : Epilogue::OutputTileIterator::kElementsPerAccess; + + if (!TensorRef_aligned(ref_A, kAlignmentA)) { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(ref_B, kAlignmentB)) { + return Status::kErrorMisalignedOperand; + } + + if (problem_size.k() % Mma::Shape::kK != 0) { + // Currently we don't support this case due to the way + // predicate iterator works, it loads the partial tile + // in the first iteration and then the full tile in the + // remaining iterations. This will cause the blockwise + // quantization parameters to go out of step with the + // weights. We can fix this by adding a predicate iterator + // that loads the full tile in the first iterations and + // then the partial tile in the last iteration. + return Status::kErrorInvalidProblem; + } + + int qscale_k = problem_size.k() / Mma::QuantBlocking::kRow; + int qscale_n = problem_size.n() / Mma::QuantBlocking::kColumn; + if ((qscale_k == 0) || (qscale_k * Mma::QuantBlocking::kRow != problem_size.k())) { + // partial block not supported + return Status::kErrorInvalidProblem; + } + if ((qscale_n == 0) || (qscale_n * Mma::QuantBlocking::kColumn != problem_size.n())) { + // partial block not supported + return Status::kErrorInvalidProblem; + } + + if (!TensorRef_aligned(ref_QScale, Mma::IteratorQScale::AccessType::kElements)) { + return Status::kErrorMisalignedOperand; + } + + if constexpr(kHasQOffset) { + if (!TensorRef_aligned(ref_QOffset, Mma::IteratorQOffset::AccessType::kElements)) { + return Status::kErrorMisalignedOperand; + } + } + + if (!TensorRef_aligned(ref_C, kAlignmentC)) { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(ref_D, kAlignmentC)) { + return Status::kErrorMisalignedOperand; + } + + return Status::kSuccess; + } + + /// Executes one GEMM + CUTLASS_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) { + + // Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord threadblock_tile_offset = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() || + params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) { + + return; + } + + // Compute initial location in logical coordinates + cutlass::MatrixCoord tb_offset_A{ + threadblock_tile_offset.m() * Mma::Shape::kM, + threadblock_tile_offset.k() * params.gemm_k_size, + }; + + cutlass::MatrixCoord tb_offset_B{ + (threadblock_tile_offset.k() * params.gemm_k_size) / 
2, + (threadblock_tile_offset.n() * Mma::Shape::kN) / 2 + }; + + // Problem size is a function of threadblock index in the K dimension + int problem_size_k = min( + params.problem_size.k(), + (threadblock_tile_offset.k() + 1) * params.gemm_k_size); + + // Compute threadblock-scoped matrix multiply-add + int gemm_k_iterations = (problem_size_k - tb_offset_A.column() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Construct iterators to A and B operands + typename Mma::IteratorA iterator_A( + params.params_A, + params.ref_A.data(), + {params.problem_size.m(), problem_size_k}, + thread_idx, + tb_offset_A, + params.gather_A_indices); + + typename Mma::IteratorB iterator_B( + params.params_B, + params.ref_B.data(), + {problem_size_k/2, params.problem_size.n()/2}, + thread_idx, + tb_offset_B, + params.gather_B_indices); + + const int qscale_k = problem_size_k / Mma::QuantBlocking::kRow; + const int qscale_n = params.problem_size.n() / Mma::QuantBlocking::kColumn; + + // should have been verified by can_implement() + assert((qscale_k > 0) && (qscale_k * Mma::QuantBlocking::kRow == problem_size_k)); + assert((qscale_n > 0) && (qscale_n * Mma::QuantBlocking::kColumn == params.problem_size.n())); + + cutlass::MatrixCoord tb_offset_QScale{ + threadblock_tile_offset.k() * (params.gemm_k_size/Mma::QuantBlocking::kRow), + threadblock_tile_offset.n() * (Mma::Shape::kN/Mma::QuantBlocking::kColumn) + }; + + typename Mma::IteratorQScale iterator_QScale( + params.params_QScale, + params.ref_QScale.data(), + {qscale_k, qscale_n}, + thread_idx, + tb_offset_QScale, + nullptr); + + typename Mma::IteratorQOffset iterator_QOffset( + params.params_QOffset, + params.ref_QOffset.data(), + {qscale_k, qscale_n}, + thread_idx, + tb_offset_QScale); + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. + const int warp_idx = canonical_warp_idx(); + const int lane_idx = threadIdx.x % 32; + + // + // Main loop + // + + // Construct thread-scoped matrix multiply + Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + if (!kSplitKSerial || gemm_k_iterations > 0) { + // Compute threadblock-scoped matrix multiply-add + mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, iterator_QScale, iterator_QOffset, accumulators); + } + + // + // Epilogue + // + + OutputOp output_op(params.output_op); + + // + // Masked tile iterators constructed from members + // + + threadblock_tile_offset = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + //assume identity swizzle + MatrixCoord threadblock_offset( + threadblock_tile_offset.m() * Mma::Shape::kM, + threadblock_tile_offset.n() * Mma::Shape::kN + ); + + int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m(); + + // Construct the semaphore. + Semaphore semaphore(params.semaphore + block_idx, thread_idx); + + // If performing a reduction via split-K, fetch the initial synchronization + if (kSplitKSerial && params.grid_tiled_shape.k() > 1) { + + // Fetch the synchronization lock initially but do not block. + semaphore.fetch(); + + // Indicate which position in a serial reduction the output operator is currently updating + output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); + } + + // Tile iterator loading from source tensor. 
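The serial split-K handshake around this epilogue boils down to a simple lock convention: partition k waits until the semaphore holds k, accumulates its partial tile on top of whatever the previous partition left in D (the first partition reads C instead), then publishes k + 1, and the last partition publishes 0 to reset the lock for the next grid. Below is a minimal sequential sketch of that convention only; the partition count and values are made up, and plain variables stand in for the CUTLASS Semaphore and the epilogue.

```
#include <cstdio>
#include <vector>

// Illustrative only: models the lock values used by the serial split-K epilogue.
int main() {
  const int partitions = 4;                                  // grid_tiled_shape.k(), hypothetical
  const std::vector<float> partial = {1.f, 2.f, 3.f, 4.f};   // per-partition partial tiles
  const float c = 0.5f;                                      // source tensor C, used by partition 0
  int semaphore = 0;                                         // initial lock value
  float d = 0.f;                                             // output tensor D

  for (int k = 0; k < partitions; ++k) {
    if (semaphore != k) { std::printf("a real kernel would spin here\n"); }  // semaphore.wait(k)
    d = (k == 0 ? c : d) + partial[k];       // later partitions read the partially accumulated D
    semaphore = (k + 1 == partitions) ? 0 : k + 1;           // semaphore.release(lock)
  }
  std::printf("D = %.2f, semaphore = %d\n", d, semaphore);
  return 0;
}
```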
+ typename Epilogue::OutputTileIterator iterator_C( + params.params_C, + params.ref_C.data(), + params.problem_size.mn(), + thread_idx, + threadblock_offset, + params.scatter_D_indices + ); + + // Tile iterator writing to destination tensor. + typename Epilogue::OutputTileIterator iterator_D( + params.params_D, + params.ref_D.data(), + params.problem_size.mn(), + thread_idx, + threadblock_offset, + params.scatter_D_indices + ); + + Epilogue epilogue( + shared_storage.epilogue, + thread_idx, + warp_idx, + lane_idx); + + // Wait on the semaphore - this latency may have been covered by iterator construction + if (kSplitKSerial && params.grid_tiled_shape.k() > 1) { + + // For subsequent threadblocks, the source matrix is held in the 'D' tensor. + if (threadblock_tile_offset.k()) { + iterator_C = iterator_D; + } + + semaphore.wait(threadblock_tile_offset.k()); + + } + + // Execute the epilogue operator to update the destination tensor. + epilogue(output_op, iterator_D, accumulators, iterator_C); + + // + // Release the semaphore + // + + if (kSplitKSerial && params.grid_tiled_shape.k() > 1) { + + int lock = 0; + if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) { + + // The final threadblock resets the semaphore for subsequent grids. + lock = 0; + } + else { + // Otherwise, the semaphore is incremented + lock = threadblock_tile_offset.k() + 1; + } + + semaphore.release(lock); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/default_quantb_mma.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/default_quantb_mma.h new file mode 100644 index 0000000000000..0af604f090e1f --- /dev/null +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/default_quantb_mma.h @@ -0,0 +1,248 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** + * Modifications Copyright (c) Microsoft. + * Licensed under the MIT license. + * + * @file default_quantb_mma.h + * @brief Modified from cutlass/gemm/threadblock/default_mma.h. + * Defining global memory data layout and iterators, combinging with mma core and + * pipelined GEMM kernel. + */ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/arch.h" +#include "cutlass/arch/wmma.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/permute.h" +#include "cutlass/transform/threadblock/predicated_tile_iterator.h" +#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h" +#include "cutlass_ext/q4gemm/threadblock/optional_predicated_tile_access_iter.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass_ext/q4gemm/threadblock/default_quantb_mma_core.h" +#include "cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for quant scales + typename ElementQScale_, + /// Element type for quant offsets + typename ElementQOffset_, + /// Layout for quant scales and offsets + typename LayoutQMeta_, + /// Blocking size for quantization + typename QuantBlocking_, + /// Element type for internal accumulation + typename ElementAccumulator_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Operator class tag + typename OperatorClass_, + /// Tag indicating architecture to tune for + typename ArchTag_, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape_, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape_, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape_, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation perfomed by GEMM + typename Operator, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. 
+ bool AccumulatorsInRowMajor = false, + /// Gather operand A by using an index array + bool GatherA = false, + /// Gather operand B by using an index array + bool GatherB = false, + /// Permute operand A + typename PermuteALayout = layout::NoPermute, + /// Permute operand B + typename PermuteBLayout = layout::NoPermute + > +struct DefaultQuantBMma; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for row-major output (OperatorClass TensorOp) +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for quant scales + typename ElementQScale, + /// Element type for quant offsets + typename ElementQOffset, + /// Layout for quant scales and offsets + typename LayoutQMeta, + /// Blocking size for quantization + typename QuantBlocking, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Layout type for C and D matrix operand + typename LayoutC, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Number of stages used in the multistage mainloop + int Stages, + /// Operation perfomed by GEMM + typename Operator, + /// Gather operand A by using an index array + bool GatherA, + /// Gather operand B by using an index array + bool GatherB, + /// Permute operand A + typename PermuteALayout, + /// Permute operand B + typename PermuteBLayout + > +struct DefaultQuantBMma { + + static_assert(platform::is_same::value + || platform::is_same>::value, + "simt epilogue must be row major"); + + static cutlass::arch::CacheOperation::Kind const CacheOpA = + ((sizeof_bits::value * kAlignmentA) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * kAlignmentB) == 128) + ? 
cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultQuantBMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementQScale, ElementQOffset, LayoutQMeta, QuantBlocking, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + Stages, Operator, false, CacheOpA, CacheOpB>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::Array; + using IteratorA = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementA, LayoutA, 1, ThreadMapA, AccessTypeA, GatherA, PermuteALayout>; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::Array; + using IteratorB = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, ThreadMapB, AccessTypeB, GatherB, PermuteBLayout>; + + // Define iterators over tiles from the quant scales + using ThreadMapQScale = typename MmaCore::IteratorThreadMapQScale; + using AccessTypeQScale = + cutlass::Array; + using IteratorQScale = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + typename MmaCore::ThreadblockQShape, + ElementQScale, LayoutQMeta, 0, ThreadMapQScale, AccessTypeQScale>; + + using ThreadMapQOffset = typename MmaCore::IteratorThreadMapQOffset; + using AccessTypeQOffset = + cutlass::Array; + using IteratorQOffset = + cutlass::transform::threadblock::OptionalPredicatedTileAccessIterator< + typename MmaCore::ThreadblockQShape, ElementQOffset, LayoutQMeta, + 0, ThreadMapQOffset, AccessTypeQOffset, MmaCore::kThreads>; + + // Define the threadblock-scoped multistage matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::QuantBMmaMultistage< + typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA, + MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB, + MmaCore::kCacheOpB, IteratorQScale, typename MmaCore::SmemIteratorQScale, + cutlass::arch::CacheOperation::Global, IteratorQOffset, + typename MmaCore::SmemIteratorQOffset, cutlass::arch::CacheOperation::Global, + ElementAccumulator, LayoutC, + typename MmaCore::MmaPolicy, Stages>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/default_quantb_mma_core.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/default_quantb_mma_core.h new file mode 100644 index 0000000000000..ad322f6505200 --- /dev/null +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/default_quantb_mma_core.h @@ -0,0 +1,340 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** + * Modifications Copyright (c) Microsoft. + * Licensed under the MIT license. + * + * @file default_quantb_mma_core.h + * @brief Modified from cutlass/gemm/threadblock/default_mma_core.h. + * Defining data layout in shared memory, and its iterators. + */ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/cutlass.h" + +#include "cutlass/layout/tensor_op_multiplicand_sm75.h" +#include "cutlass/layout/tensor_op_multiplicand_sm80.h" + +#include "cutlass/gemm/warp/mma_simt_policy.h" +#include "cutlass/gemm/warp/mma_simt.h" +#include "cutlass_ext/q4gemm/warp/default_quantb_mma_tensor_op.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h" + +#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core.h" +#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h" + +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/transform/pitch_linear_thread_map.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h" +#include "cutlass_ext/q4gemm/threadblock/optional_regular_tile_access_iter.h" + +#include "cutlass/util/debug.h" +#include "cutlass/util/device_dump.h" +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Template defininng default matrix multiply operators inferred from threadblock tile size, +/// global memory data layout, and target math instruction. 
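The quantization-blocking arithmetic used throughout this header is easiest to see in isolation: QuantBlocking describes how many weights share one scale (and optional offset), and it must span either a single row or a single column of the K x N weight tile. The sketch below assumes a hypothetical 64x64x64 threadblock tile with a 32x1 blocking (32 weights along K per scale); it only mirrors the ThreadblockQShape arithmetic and divisibility checks found in the specialization further down, and is not part of the kernel itself.

```
#include <cstdio>

// Illustrative only: placeholder tile and blocking sizes.
struct Shape    { static constexpr int kM = 64, kN = 64, kK = 64; };  // threadblock tile
struct Blocking { static constexpr int kRow = 32, kColumn = 1; };     // 32 weights along K share one scale

static_assert(Shape::kK % Blocking::kRow == 0, "K must be a multiple of QuantBlocking::kRow");
static_assert(Shape::kN % Blocking::kColumn == 0, "N must be a multiple of QuantBlocking::kColumn");
static_assert(Blocking::kRow == 1 || Blocking::kColumn == 1, "only single-row or single-column blocking");

int main() {
  constexpr int qscale_rows = Shape::kK / Blocking::kRow;     // 64 / 32 = 2
  constexpr int qscale_cols = Shape::kN / Blocking::kColumn;  // 64 / 1  = 64
  std::printf("per-threadblock quant scale tile: %d x %d\n", qscale_rows, qscale_cols);
  return 0;
}
```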
+template < + /// Shape of threadblock-scoped matrix multiply operator + typename Shape, + /// Shape of warp-level matrix multiply operator + typename WarpShape, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape, + /// Element data type of A operand + typename ElementA, + /// Layout of operand A + typename LayoutA, + /// Element data type of B operand + typename ElementB, + /// Layout of operand B + typename LayoutB, + /// Element data type of quant scale + typename ElementQScale, + /// Element data type of quant offset + typename ElementQOffset, + /// Layout of quant scale + typename LayoutQMeta, + /// Blocking dimensions for quantization + typename QuantBlocking, + /// Data type of accumulator + typename ElementC, + /// Layout of accumulator + typename LayoutC, + /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp) + typename OperatorClass, + /// Number of stages + int Stages = 2, + /// Operation performed by MMA + typename Operator = typename platform::conditional< + (platform::is_same::value) && + (platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value), + cutlass::arch::OpMultiplyAddSaturate, + cutlass::arch::OpMultiplyAdd>::type, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. + bool AccumulatorsInRowMajor = false, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA = + cutlass::arch::CacheOperation::Global, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB = + cutlass::arch::CacheOperation::Global, + /// per-element transformation for elements of A + ComplexTransform TransformA = ComplexTransform::kNone, + /// per-element transformation for elements of B + ComplexTransform TransformB = ComplexTransform::kNone, + bool IsComplex = false // (is_complex::value || is_complex::value) +> +struct DefaultQuantBMmaCore; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: row-major +/// B: column-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Element data type of quant scale + typename ElementQScale_, + /// Element data type of quant offset + typename ElementQOffset_, + /// Layout of quant scale + typename LayoutQMeta_, + /// Blocking dimensions for quantization + typename QuantBlocking_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultQuantBMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = layout::RowMajor; + using ElementB = ElementB_; + using LayoutB = 
layout::ColumnMajor; + + using ElementQScale = ElementQScale_; + using ElementQOffset = ElementQOffset_; + using LayoutQMeta = LayoutQMeta_; + using QuantBlocking = QuantBlocking_; + + using ElementC = ElementC_; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 128; + + /// Default Operator + using Operator = Operator_; + + // Warp thread arrangement + static int const kWarpThreadArrangementContiguousA = + Shape::kK / (kAccessSizeInBits / sizeof_bits::value); + + static int const kWarpThreadArrangementStridedA = + kWarpSize / kWarpThreadArrangementContiguousA; + + static int const kWarpThreadArrangementContiguousB = + (Shape::kK / 2) / (kAccessSizeInBits / sizeof_bits::value); + + static int const kWarpThreadArrangementStridedB = + kWarpSize / kWarpThreadArrangementContiguousB; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, Shape::kK>; + + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, Shape::kK/2>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + IteratorThreadMapB>; + + using SmemLayoutQScale = LayoutQMeta; + using SmemLayoutQOffset = LayoutQMeta; + + /// Threadblock-level quantization meta data shape + using ThreadblockQShape = MatrixShape; + static_assert(Shape::kK % QuantBlocking::kRow == 0, "K must be multiple of QuantBlocking::kRow"); + static_assert(Shape::kN % QuantBlocking::kColumn == 0, "N must be multiple of QuantBlocking::kColumn"); + static_assert(ThreadblockQShape::kCount > 0, "QuantBlocking too big to fit in a thread block!"); + static_assert(QuantBlocking::kRow == 1 || QuantBlocking::kColumn == 1, + "Only support single column or row quantize blocking!"); + static_assert(QuantBlocking::kColumn != 1 || std::is_same::value, + "Quant scale matrix's major dimension must have more elements, to facilitate fast loading!"); + + /// Threadblock-level quantization meta data shape in pitch-linear layout + using TBQPitchLinearShape = typename std::conditional< + std::is_same::value, + layout::PitchLinearShape, + 
layout::PitchLinearShape>::type; + + /// By default we would like to use 128b load. However, we can't load more than + /// a column at a time in a column major layout. + static int const kElementsPerAccessQScale = + (kAccessSizeInBits / sizeof_bits::value) > TBQPitchLinearShape::kContiguous + ? TBQPitchLinearShape::kContiguous + : (kAccessSizeInBits / sizeof_bits::value); + + /// quant scale is tiny. Not all threads are needed. + static int const kAccessCntQScale = ThreadblockQShape::kCount / kElementsPerAccessQScale; + static int const kThreadsQScale = (kAccessCntQScale > kThreads) ? kThreads : kAccessCntQScale; + + using IteratorThreadMapQScale = transform::PitchLinearStripminedThreadMap< + TBQPitchLinearShape, kThreadsQScale, kElementsPerAccessQScale>; + + using SmemIteratorQScale = transform::threadblock::RegularTileAccessIterator< + ThreadblockQShape, ElementQScale, SmemLayoutQScale, 1, IteratorThreadMapQScale>; + + static int const kElementsPerAccessQOffset = + (kAccessSizeInBits / sizeof_bits::value) > TBQPitchLinearShape::kContiguous + ? TBQPitchLinearShape::kContiguous + : (kAccessSizeInBits / sizeof_bits::value); + static int const kAccessCntQOffset = ThreadblockQShape::kCount / kElementsPerAccessQOffset; + static int const kThreadsQOffset = (kAccessCntQOffset > kThreads) ? kThreads : kAccessCntQOffset; + + using IteratorThreadMapQOffset = transform::PitchLinearStripminedThreadMap< + TBQPitchLinearShape, kThreadsQOffset, kElementsPerAccessQOffset>; + + using SmemIteratorQOffset = transform::threadblock::OptionalRegularTileAccessIterator< + ThreadblockQShape, ElementQOffset, SmemLayoutQOffset, 1, IteratorThreadMapQOffset, kThreads>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultQuantBMmaTensorOp< + WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, + ElementQScale, SmemLayoutQScale, ElementQOffset, SmemLayoutQScale, QuantBlocking, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/optional_predicated_tile_access_iter.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/optional_predicated_tile_access_iter.h new file mode 100644 index 0000000000000..6f27a692a3a2e --- /dev/null +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/optional_predicated_tile_access_iter.h @@ -0,0 +1,314 @@ +/** + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT license. + * + * @file optional_predicated_tile_access_iter.h + * @brief Templates for loading and storing optional tiles of matrix data. + * This iterator is just a wrapper of PredicatedTileAccessIterator, with + * the option to turn it off at compile time and minimize its runtime + * footprint. Also, it utilize the higher numbered threads in the + * threadblock when the iterator can not utilize all the threads. 
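+ *
+ * For example, if the threadblock has 256 threads but the thread map of this
+ * iterator only needs 16 of them, flip_thread_id() maps thread 255 to 0,
+ * thread 254 to 1, and so on, so the 16 highest-numbered threads service this
+ * iterator while the lower-numbered threads keep loading the regular operand
+ * tiles. (Thread counts here are illustrative.)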
+ */ + +#pragma once + +#include + +#include "cutlass/transform/threadblock/predicated_tile_access_iterator.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace transform { +namespace threadblock { + + +//////////////////////////////////////////////////////////////////////////////// + +/// Optional 2-D matrix data loader, when element is std::monostate, the +/// iterator becomes no-op with minimal runtime footprint. Also, it utilize the +/// higher numbered threads in the threadblock when the iterator can not utilize +/// all the threads. +/// +template < + /// Tile shape of the iterator + typename Shape_, + /// Element data type of the iterator, no-op when it is std::monostate + typename Element_, + /// Layout of the source matrix + typename Layout_, + int AdvanceRank_, + typename ThreadMap_, + typename AccessType_, + /// Number of threads in the threadblock, when provided, the iterator + /// will utilize the higher numbered threads + int kThreadBlockSize_ = -1> +class OptionalPredicatedTileAccessIterator{ + public: + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + static constexpr int kAdvanceRank = AdvanceRank_; + static constexpr int kThreadblockSize = kThreadBlockSize_; + + static_assert(!std::is_same::value, + "Disabled Iterator failed to match the specialized version below."); + static_assert(kThreadblockSize == -1 || kThreadblockSize >= ThreadMap::kThreads, + "kThreadblockSize must be no smaller than ThreadMap::kThreads"); + + using Base = PredicatedTileAccessIterator; + + using LongIndex = typename Base::LongIndex; + using Mask = typename Base::Mask; + using TensorCoord = typename Base::TensorCoord; + using TensorRef = typename Base::TensorRef; + using Params = typename Base::Params; + using Pointer = typename Base::Pointer; + + static constexpr int kAccessesPerVector = Base::kAccessesPerVector; + + CUTLASS_HOST_DEVICE + static int flip_thread_id(int thread_id){ + if constexpr (kThreadblockSize > 0) { + return kThreadblockSize - 1 - thread_id; + } + return thread_id; + } + + public: + Base base_; + + /// Default constructor + OptionalPredicatedTileAccessIterator(): base_() {}; + + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + OptionalPredicatedTileAccessIterator( + /// Precomputed parameters object + Params const ¶ms, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const &threadblock_offset) + : base_(params, pointer, extent, flip_thread_id(thread_id), threadblock_offset) {} + + /// Construct a PredicatedTileAccessIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + OptionalPredicatedTileAccessIterator( + /// Precomputed parameters object + Params const ¶ms, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id) + : OptionalPredicatedTileAccessIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + base_.set_iteration_index(index); + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + 
base_.add_pointer_offset(pointer_offset); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_DEVICE + void add_tile_offset( + TensorCoord const &tile_offset) { + base_.add_tile_offset(tile_offset); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return base_.get(); + } + + /// Increment and return an instance to self. + CUTLASS_HOST_DEVICE + OptionalPredicatedTileAccessIterator &operator++() { + ++base_; + return *this; + } + + /// Increment and return an instance to self. + CUTLASS_HOST_DEVICE + OptionalPredicatedTileAccessIterator operator++(int) { + OptionalPredicatedTileAccessIterator self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + base_.clear_mask(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + base_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const &mask) { + base_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask &mask) { + base_.get_mask(mask); + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { + return base_.valid(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for the disabled version +/// Reduce runtime overhead +/// +template < + /// Tile shape of the iterator + typename Shape_, + typename Layout_, + int AdvanceRank_, + typename ThreadMap_, + typename AccessType_, + int kThreadBlockSize_> +class OptionalPredicatedTileAccessIterator{ + public: + + using Shape = Shape_; + using Element = std::monostate; + using Layout = Layout_; + static int const kAdvanceRank = AdvanceRank_; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + static constexpr int kThreadblockSize = kThreadBlockSize_; + + using Base = PredicatedTileAccessIterator; + + using LongIndex = typename Base::LongIndex; + using Mask = typename Base::Mask; + using TensorCoord = typename Base::TensorCoord; + using TensorRef = typename Base::TensorRef; + using Params = typename Base::Params; + using Pointer = typename Base::Pointer; + + static constexpr int kAccessesPerVector = Base::kAccessesPerVector; + + public: + std::monostate base_; + + /// Default constructor + OptionalPredicatedTileAccessIterator(): base_() {}; + + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + OptionalPredicatedTileAccessIterator( + /// Precomputed parameters object + Params const ¶ms, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const &threadblock_offset) + : base_() {} + + /// Construct a PredicatedTileAccessIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + OptionalPredicatedTileAccessIterator( + /// Precomputed parameters object + Params const ¶ms, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id) + : base_() {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void 
add_pointer_offset(LongIndex pointer_offset) {} + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_DEVICE + void add_tile_offset( + TensorCoord const &tile_offset) {} + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return nullptr; + } + + /// Increment and return an instance to self. + CUTLASS_HOST_DEVICE + OptionalPredicatedTileAccessIterator &operator++() { + return *this; + } + + /// Increment and return an instance to self. + CUTLASS_HOST_DEVICE + OptionalPredicatedTileAccessIterator operator++(int) { + return *this; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) {} + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() {} + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const &mask) {} + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask &mask) {} + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() const { return false; } +}; + +//////////////////////////////////////////////////////////////////////////////// +} // namespace threadblock +} // namespace transform +} // namespace cutlass diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/optional_regular_tile_access_iter.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/optional_regular_tile_access_iter.h new file mode 100644 index 0000000000000..4b0ae5317f8bb --- /dev/null +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/optional_regular_tile_access_iter.h @@ -0,0 +1,224 @@ +/** + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT license. + * + * @file optional_regular_tile_access_iter.h + * @brief Templates implementing the address computation of storing of tiles + * from pitch-linear rank=2 tensors. + * + * This iterator is just a wrapper of RegularTileAccessIterator, with the + * option to turn it off at compile time and minimize its runtime footprint. + * Also, it utilize the higher numbered threads in the threadblock when the + * iterator can not utilize all the threads. + * + * Must be used in conjunction with OptionalPredicatedTileAccessIterator, + * with the same template parameters. + */ + +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace transform { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Optional 2-D tile iterator, when element is std::monostate, the iterator +/// becomes no-op with minimal runtime footprint. Also, it utilize the higher +/// numbered threads in the threadblock when the iterator can not utilize all +/// the threads. 
+/// +template < + /// Tile shape of the iterator + typename Shape_, + typename Element_, + typename Layout_, + int AdvanceRank, + typename ThreadMap_, + /// Number of threads in the threadblock, when not -1, the iterator + /// will utilize the higher numbered threads + int ThreadblockSize_ = -1, + int Alignment = + sizeof_bits::value * ThreadMap_::kElementsPerAccess / 8> +class OptionalRegularTileAccessIterator{ + public: + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using ThreadMap = ThreadMap_; + static constexpr int kAlignment = Alignment; + static constexpr int kThreadblockSize = ThreadblockSize_; + + static_assert(!std::is_same::value, + "Disabled Iterator failed to match the specialized template"); + static_assert(kThreadblockSize == -1 || kThreadblockSize >= ThreadMap::kThreads, + "kThreadblockSize must be no smaller than ThreadMap::kThreads"); + + using Base = RegularTileAccessIterator; + + using LongIndex = typename Base::LongIndex; + using TensorRef = typename Base::TensorRef; + using TensorCoord = typename Base::TensorCoord; + using AccessType = typename Base::AccessType; + + CUTLASS_HOST_DEVICE + static int flip_thread_id(int thread_id){ + if constexpr (kThreadblockSize > 0) { + return kThreadblockSize - 1 - thread_id; + } + return thread_id; + } + + private: + + Base base_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + OptionalRegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : base_(ref, flip_thread_id(thread_id)) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + base_.set_iteration_index(index); + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + base_.add_pointer_offset(pointer_offset); + } + + /// Returns a pointer + CUTLASS_DEVICE + AccessType *get() const { + return base_.get(); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + OptionalRegularTileAccessIterator &operator++() { + ++base_; + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + OptionalRegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + this->operator++(); + + return prev; + } + + /// Adds a tile offset in the unit of tile. + /// In GEMM/Conv implementation, this is used to move in the k dimension in the shared memory. + /// Below layouts are the shared memory layouts. Current SM50 SIMT kernels only use col major A and row major B. + /// For row major A operand, k dimension is contiguous dimension; + /// For col major A operand, k dimension is strided dimension; + /// For row major B operand, k dimension is strided dimension; + /// For col major B operand, k dimension is contiguous dimension. + /// Below two classes map col/row major to the pitch linear coordinates used + /// in this base class. 
+ CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + base_.add_tile_offset(coord); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization when Element is std::monostate, the iterator becomes no-op +/// +template < + typename Shape_, + typename Layout_, + int AdvanceRank, + typename ThreadMap_, + int ThreadblockSize_, + int Alignment> +class OptionalRegularTileAccessIterator{ + public: + + using Shape = Shape_; + using Element = std::monostate; + using Layout = Layout_; + using ThreadMap = ThreadMap_; + static constexpr int kAlignment = Alignment; + static constexpr int kThreadblockSize = ThreadblockSize_; + + using Base = RegularTileAccessIterator; + + using LongIndex = typename Base::LongIndex; + using TensorRef = typename Base::TensorRef; + using TensorCoord = typename Base::TensorCoord; + using AccessType = typename Base::AccessType; + + private: + + std::monostate base_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + OptionalRegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : base_() {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) {} + + /// Returns a pointer + CUTLASS_DEVICE + AccessType *get() const { + return nullptr; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + OptionalRegularTileAccessIterator &operator++() { + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + OptionalRegularTileAccessIterator operator++(int) { + return *this; + } + + /// Adds a tile offset in the unit of tile. + /// In GEMM/Conv implementation, this is used to move in the k dimension in the shared memory. + /// Below layouts are the shared memory layouts. Current SM50 SIMT kernels only use col major A and row major B. + /// For row major A operand, k dimension is contiguous dimension; + /// For col major A operand, k dimension is strided dimension; + /// For row major B operand, k dimension is strided dimension; + /// For col major B operand, k dimension is contiguous dimension. + /// Below two classes map col/row major to the pitch linear coordinates used + /// in this base class. + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) {} +}; + +} // namespace threadblock +} // namespace transform +} // namespace cutlass diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h new file mode 100644 index 0000000000000..8b6bac8c5099a --- /dev/null +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h @@ -0,0 +1,1290 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** + * Modifications Copyright (c) Microsoft. + * Licensed under the MIT license. + * + * @file quantb_mma_multistage.h + * @brief Modified from cutlass/gemm/threadblock/mma_multistage.h. + * Added the quantized data memory pipeline, dequantization, and feeding + * to tensor cores. Mainloop pipeline is heavily modified. + */ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/threadblock/mma_base.h" + +#include "cutlass/util/debug.h" +#include "cutlass/util/device_dump.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// +namespace{ + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Utilities for printing layout for the prepacked weights and quantization parameters +/// +template< + /// Data type of the prepacked weights + typename ElementWeight, + /// Data type of the quant scales + typename ElementQScale, + /// Data type of the quant offsets + typename ElementQOffset> +struct QuantBLayoutDebug{ + static constexpr bool debug_smem = true; + static constexpr bool debug_fragment = true; + ElementWeight* smem_b_ptr_; + ElementQScale* smem_qscale_ptr_; + ElementQOffset* smem_qoffset_ptr_; + int warp_id_; + int lane_id_; + int block_id_; + + template + CUTLASS_DEVICE + static void print_fragment(cutlass::Array const& frag, char label, int block_id, int warp_id, int lane_id){ + static_assert(Size % 4 == 0, "Size must be multiple of 4"); + if constexpr (debug_fragment){ + if (block_id == 1 && warp_id == 0){ + const Element* ptr = reinterpret_cast(&frag); + for (int i = 0; i < Size/4; i++, ptr+=4){ + if constexpr(std::is_integral::value){ + printf("T%.2d%c%d, %3d, %3d, %3d, %3d\n", + threadIdx.x, label, i, + ptr[0], ptr[1], ptr[2], ptr[3]); + } else { + printf("T%.2d%c%d, %.3f, %.3f, %.3f, %.3f\n", + threadIdx.x, label, i, + float(ptr[0]), float(ptr[1]), float(ptr[2]), float(ptr[3])); + } + } + } + } + } + + template + CUTLASS_DEVICE + static void 
print_as_int4(cutlass::Array const& frag, char label, int block_id, int warp_id, int lane_id){ + constexpr int I8Size = Size * cutlass::sizeof_bits::value / 8; + static_assert(I8Size % 2 == 0, "Size must be multiple of 4"); + if constexpr (debug_fragment){ + if (block_id == 1 && warp_id == 0){ + const uint8_t* ptr = reinterpret_cast(&frag); + for (int i = 0; i < I8Size/2; i++, ptr+=2){ + printf("T%.2dW%d, %d, %d, %d, %d\n", threadIdx.x, i, ptr[0] & 0x0f, ptr[0] >> 4, ptr[1] & 0x0f, ptr[1] >> 4); + } + } + } + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Dummy type when quant offset is not used, to avoid compilation error, +/// and reduce runtime footprint +/// +struct DummyType{ + std::monostate dummy_; + public: + DummyType() = default; + + CUTLASS_HOST_DEVICE + void* data() const { + return nullptr; + } + + CUTLASS_HOST_DEVICE + std::monostate& operator[](int idx) { + return dummy_; + } +}; + +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class QuantBMmaBase { + public: + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + + ///< Policy describing tuning details + using Policy = Policy_; + + // + // Dependent types + // + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Shape describing the overall GEMM computed from shared memory + /// by each warp. 
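+ /// For example (shapes are illustrative), a 64x64x64 warp-level GEMM built from
+ /// 16x8x16 tensor core MMA instructions gives kWarpGemmIterations = 64 / 16 = 4
+ /// below, which satisfies the "at least two, and even" requirements asserted
+ /// further down.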
+ using WarpGemm = typename Policy::Operator::Shape; + + /// Shape describing the number of warps filling the CTA + using WarpCount = GemmShape; + + /// Number of warp-level GEMM oeprations + static int const kWarpGemmIterations = + (WarpGemm::kK / Operator::Policy::MmaShape::kK); + + /// Number of stages + static int const kStages = Stages; + + static constexpr bool kHasQOffset = !std::is_same::value; + + /// Tensor reference to the A operand + using TensorRefA = TensorRef; + + /// Tensor reference to the prepacked weights + using TensorRefB = TensorRef; + + static_assert(kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + static_assert((kWarpGemmIterations % 2) == 0, + "Inner loop iteration must be an even number."); + + // Tensor reference to the quantization scales + using TensorRefQScale = TensorRef; + using TensorRefQOffset = TensorRef; + + // Block size of the quantization (one set of quantization parameters per block of weights) + using QuantBlocking = typename Operator::QuantBlocking; + + // + // Nested structs + // + + /// Shared storage object needed by threadblock-scoped GEMM + class SharedStorage { + public: + // + // Type definitions + // + + /// Shape of the A matrix operand in shared memory + using ShapeA = MatrixShape; + + /// Shape of the prepacked weights in shared memory + using ShapeB = + MatrixShape; + + /// Shape of the quantization parameter matrix in shared memory + /// Validation done in mma core class ThreadblockQShape + using ShapeQScale = + MatrixShape<(Shape::kK / QuantBlocking::kRow) * kStages, + Shape::kN / QuantBlocking::kColumn>; + + using BufTypeQOffset = std::conditional_t, + DummyType>; + public: + // + // Data members + // + + /// Buffer for A operand + AlignedBuffer operand_A; + + /// Buffer for prepacked weights + AlignedBuffer operand_B; + + /// Buffer for quantization scales + AlignedBuffer operand_QScale; + + /// Buffer for quantization offsets + BufTypeQOffset operand_QOffset; + + public: + + // + // Methods + // + + /// Returns a layout object for the A matrix + CUTLASS_DEVICE + static typename Operator::LayoutA LayoutA() { + return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn}); + } + + /// Returns a layout object for the B matrix + CUTLASS_HOST_DEVICE + static typename Operator::LayoutB LayoutB() { + return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn}); + } + + CUTLASS_HOST_DEVICE + static typename Operator::SmemLayoutQScale LayoutQMeta() { + return Operator::SmemLayoutQScale::packed({ShapeQScale::kRow, ShapeQScale::kColumn}); + } + + CUTLASS_HOST_DEVICE + static typename Operator::SmemLayoutQOffset LayoutQOffset() { + return Operator::SmemLayoutQOffset::packed({ShapeQScale::kRow, ShapeQScale::kColumn}); + } + + /// Returns a TensorRef to the A operand + CUTLASS_HOST_DEVICE + TensorRefA operand_A_ref() { + return TensorRefA{operand_A.data(), LayoutA()}; + } + + /// Returns a TensorRef to the prepacked weights + CUTLASS_HOST_DEVICE + TensorRefB operand_B_ref() { + return TensorRefB{operand_B.data(), LayoutB()}; + } + + /// Returns a TensorRef to the quantization scales + CUTLASS_HOST_DEVICE + TensorRefQScale operand_QScale_ref() { + return TensorRefQScale{operand_QScale.data(), LayoutQMeta()}; + } + + CUTLASS_HOST_DEVICE + TensorRefQOffset operand_QOffset_ref() { + if constexpr (!kHasQOffset){ + return TensorRefQOffset(); + } else { + return TensorRefQOffset{operand_QOffset.data(), LayoutQOffset()}; + } + } + }; + + protected: + + // + // Data members + // + 
+ /// Iterator to load a warp-scoped tile of A operand from shared memory + typename Operator::IteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of B operand from shared memory + typename Operator::IteratorB warp_tile_iterator_B_; + + /// Iterator to load a warp-scoped tile of quant scales from shared memory + typename Operator::IteratorQMeta warp_tile_iterator_QScale_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + QuantBMmaBase( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx + ): + warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx), + warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx), + warp_tile_iterator_QScale_(shared_storage.operand_QScale_ref(), + shared_storage.operand_QOffset_ref(), lane_idx) + {} +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Iterators over tiles of quant scales in global memory + typename IteratorQScale_, + /// Iterators over tiles of quant scales in shared memory + typename SmemIteratorQScale_, + /// Cache operation for quant scales + cutlass::arch::CacheOperation::Kind CacheOpQScale, + /// Iterators over tiles of quant scales in global memory + typename IteratorQOffset_, + /// Iterators over tiles of quant scales in shared memory + typename SmemIteratorQOffset_, + /// Cache operation for quant scales + cutlass::arch::CacheOperation::Kind CacheOpQOffset, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class QuantBMmaMultistage : + public QuantBMmaBase { +public: + ///< Base class + using Base = QuantBMmaBase; + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + ///< Data type of accumulator matrix + using ElementC = ElementC_; + ///< Layout of accumulator matrix + using LayoutC = LayoutC_; + ///< Policy describing tuning details + using Policy = Policy_; + + using SmemIteratorA = SmemIteratorA_; + using 
SmemIteratorB = SmemIteratorB_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + using IteratorQScale = IteratorQScale_; + using IteratorQOffset = IteratorQOffset_; + using SmemIteratorQScale = SmemIteratorQScale_; + using SmemIteratorQOffset = SmemIteratorQOffset_; + using QuantBlocking = typename Base::QuantBlocking; + + static cutlass::arch::CacheOperation::Kind const kCacheOpQScale = CacheOpQScale; + static cutlass::arch::CacheOperation::Kind const kCacheOpQOffset = CacheOpQOffset; + static constexpr bool kHasQOffset = Base::kHasQOffset; + + // + // Dependent types + // + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Minimum architecture is Sm80 to support cp.async + using ArchTag = arch::Sm80; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + /// Internal structure exposed for introspection. + struct Detail { + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = + IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of packed weights + static int const AsyncCopyIterationsPerStageB = + IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA = + (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB = + (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + static int const AsyncCopyIterationsPerStageQScale = + IteratorQScale::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of quant scale + static int const kAccessesPerGroupQScale = + (AsyncCopyIterationsPerStageQScale + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + static int const AsyncCopyIterationsPerStageQOffset = + IteratorQOffset::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of quant offset + static int const kAccessesPerGroupQOffset = + (AsyncCopyIterationsPerStageQOffset + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + // Optional staged-accumulation (e.g., tf32x3 kernels) for improved numerical + // accuracy, where each mainloop iteration first accumulates into a temporary + // set of freshly-cleared accumulators, which are subsequently added to the + // final accumulator set. 
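+    // A rough pseudo-code sketch of staged accumulation (illustrative only,
+    // not the exact control flow of the mainloop below):
+    //   tmp_accum.clear();
+    //   for each warp-level MMA in the iteration: tmp_accum = mma(A, B, tmp_accum);
+    //   accum += tmp_accum;
+    // instead of feeding `accum` directly into every MMA instruction.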
+ static bool const kStagedAccumulation = arch::UseStagedAccumulation::value; + }; + + private: + + + // Structure encapsulating pipeline state live from one iteration to the next + struct PipeState { + + using WarpLoadedFragmentA = typename Operator::FragmentA; + using WarpLoadedFragmentB = typename Operator::FragmentB; + using WarpTransformedFragmentA = typename Operator::TransformedFragmentA; + using WarpTransformedFragmentB = typename Operator::TransformedFragmentB; + + /// Temporary accumulator to facilitate staged-accumulation + FragmentC tmp_accum_; + + /// Pair of A fragments used to overlap shared memory loads and math instructions + WarpLoadedFragmentA warp_loaded_frag_A_[2]; + + /// Pair of B fragments used to overlap shared memory loads and math instructions + WarpLoadedFragmentB warp_loaded_frag_B_; + WarpTransformedFragmentB warp_transformed_frag_B_[2]; + + using WarpLoadedFragmentQScale = typename Operator::FragmentQScale; + WarpLoadedFragmentQScale warp_loaded_frag_QScale_; + + using WarpLoadedFragmentQOffset = typename std::conditional::type; + WarpLoadedFragmentQOffset warp_loaded_frag_QOffset_; + }; + + + private: + + // + // Data members + // + + /// Warp-level MMA operator + Operator warp_mma_; + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + /// Iterator to write threadblock-scoped tile of quant meta data to shared memory + SmemIteratorQScale smem_iterator_QScale_; + SmemIteratorQOffset smem_iterator_QOffset_; + + /// Shared memory write stage index + int smem_write_stage_idx_; + + /// Shared memory read stage index + int smem_read_stage_idx_; + + /// very small meta data tensor require less threads to load + bool const should_load_qscale_; + bool const should_load_qoffset_; + + /// Shared memory pointers for debug dumping + static constexpr bool debug_layout = false; + using LayoutDebugType = typename std::conditional, + std::monostate>::type; + LayoutDebugType layout_debug_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + QuantBMmaMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx), + smem_iterator_QScale_(shared_storage.operand_QScale_ref(), thread_idx), + smem_iterator_QOffset_(shared_storage.operand_QOffset_ref(), thread_idx), + should_load_qscale_(thread_idx < IteratorQScale::ThreadMap::kThreads), + should_load_qoffset_(thread_idx >= IteratorQOffset::kThreadblockSize - IteratorQOffset::ThreadMap::kThreads), + smem_write_stage_idx_(0), + smem_read_stage_idx_(0) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + if constexpr(debug_layout){ + layout_debug_.smem_b_ptr_ = shared_storage.operand_B_ref().data(); + layout_debug_.smem_qscale_ptr_ = shared_storage.operand_QScale_ref().data(); + if 
constexpr(kHasQOffset){ + layout_debug_.smem_qoffset_ptr_ = shared_storage.operand_QOffset_ref().data(); + } else { + layout_debug_.smem_qoffset_ptr_ = nullptr; + } + layout_debug_.warp_id_ = warp_idx; + layout_debug_.lane_id_ = lane_idx; + layout_debug_.block_id_ = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z; + } + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + this->warp_tile_iterator_QScale_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + /// Advance shared memory read-iterators to the next stage + CUTLASS_DEVICE + void advance_smem_read_stage() + { + ++smem_read_stage_idx_; + + if (smem_read_stage_idx_ == Base::kStages) { + // Wrap back around to the 'start' of the circular buffer in shared memory + this->warp_tile_iterator_A_.add_tile_offset({0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset({-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, 0}); + this->warp_tile_iterator_QScale_.add_tile_offset({-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, 0}); + + smem_read_stage_idx_ = 0; + } + } + + /// Advance global memory read-iterators and shared memory write-iterators to the stage + CUTLASS_DEVICE + void advance_smem_write_stage( + IteratorA &iterator_A, + IteratorB &iterator_B, + IteratorQScale &iterator_QScale, + IteratorQOffset &iterator_QOffset) + { + // Advance global iterators + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + iterator_QScale.add_tile_offset({1, 0}); + + // Advance shared iterators + smem_iterator_A_.add_tile_offset({0, 1}); + smem_iterator_B_.add_tile_offset({1, 0}); + smem_iterator_QScale_.add_tile_offset({1, 0}); + + if constexpr (kHasQOffset) { + iterator_QOffset.add_tile_offset({1, 0}); + smem_iterator_QOffset_.add_tile_offset({1, 0}); + } + + // Increment shared memory write stage index + ++smem_write_stage_idx_; + + if (smem_write_stage_idx_ == Base::kStages) { + // Wrap back around to the 'start' of the circular buffer in shared memory + smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + smem_iterator_QScale_.add_tile_offset({-Base::kStages, 0}); + if constexpr (kHasQOffset) { + smem_iterator_QOffset_.add_tile_offset({-Base::kStages, 0}); + } + smem_write_stage_idx_ = 0; + } + } + + CUTLASS_DEVICE + void copy_qscale_tiles(IteratorQScale &iterator_QScale){ + // Quant scale matrix is 1/block_size of the B matrix, for a 64x64 warp tile, + // it's only 64x64/block_size elements. For blocking size 16 ~ 64, it only + // takes 4 ~ 16 cp.async instructions to load. One warp has 32 threads, so + // it should be loaded in less than one cp.async instruction per thread. + // Even less for quant offset matrix. 
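+    // Worked example with illustrative numbers: a 64x64 weight tile quantized
+    // with block size 64 needs 64*64/64 = 64 scale elements, and with block
+    // size 16 it needs 64*64/16 = 256 -- versus 4096 weights -- hence the
+    // single-shot load asserted below.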
+ static_assert(Detail::AsyncCopyIterationsPerStageQScale == 1, + "Quant scale should be loaded in one shot!"); + static_assert(IteratorQScale::kAccessesPerVector == 1, + "Quant scale should 1 access per vector!"); + + // Async Copy for quantization scale + typename IteratorQScale::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_QScale_.get()); + + constexpr int kSrcBytes = + sizeof_bits::value * + IteratorQScale::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async( + dst_ptr, iterator_QScale.get(), iterator_QScale.valid()); + } + + CUTLASS_DEVICE + void copy_qoffset_tiles(IteratorQOffset & iterator_QOffset) { + static_assert(Detail::AsyncCopyIterationsPerStageQOffset == 1, + "Quant offset should be loaded in one shot!"); + static_assert(IteratorQOffset::kAccessesPerVector == 1, + "Quant offset should 1 access per vector!"); + + if constexpr(kHasQOffset) { + // Async Copy for quantization offset + typename IteratorQOffset::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_QOffset_.get()); + + constexpr int kSrcBytes = sizeof_bits::value * + IteratorQOffset::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async( + dst_ptr, iterator_QOffset.get(), iterator_QOffset.valid()); + } + } + + CUTLASS_DEVICE + void copy_tiles_and_advance(IteratorA &iterator_A, IteratorB &iterator_B, + int group_start = 0) { + auto group_start_A = group_start * Detail::kAccessesPerGroupA; + iterator_A.set_iteration_index(group_start_A * + IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) { + if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_A.get(); + + cutlass::arch::cp_async( + dst_ptr + v, gmem_ptr, iterator_A.valid()); + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + } + + auto group_start_B = group_start * Detail::kAccessesPerGroupB; + iterator_B.set_iteration_index(group_start_B * + IteratorB::kAccessesPerVector); + this->smem_iterator_B_.set_iteration_index(group_start_B); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) { + if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_B.get(); + + cutlass::arch::cp_async( + dst_ptr + v, gmem_ptr, iterator_B.valid()); + + ++iterator_B; + } + ++this->smem_iterator_B_; + } + } + } + + /// GEMM prologue. 
Bootstrap the global->shared memory pipeline by fetching + /// the global fragments needed by the first kStages-1 threadblock mainloop iterations + CUTLASS_DEVICE + void prologue( + IteratorA &iterator_A, ///< [in|out] iterator over A operand in global memory + IteratorB &iterator_B, ///< [in|out] iterator over B operand in global memory + IteratorQScale &iterator_QScale, ///< [in|out] iterator over quant scales in global memory + IteratorQOffset &iterator_QOffset, ///< [in|out] iterator over quant offsets in global memory + int &gemm_k_iterations) ///< [in|out] number of threadblock mainloop iterations remaining + { + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; ++stage, --gemm_k_iterations) { + + // Disable global fetching if done with global fetch iterations + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + iterator_QScale.clear_mask(gemm_k_iterations == 0 || !should_load_qscale_); + + iterator_A.set_iteration_index(0); + this->smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + int src_bytes = (iterator_A.valid() ? kSrcBytes : 0); + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + + iterator_B.set_iteration_index(0); + this->smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + } + + ++this->smem_iterator_B_; + } + + // Async Copy for quantization scale + static_assert(Detail::AsyncCopyIterationsPerStageQScale == 1, "Quant scale should be loaded in one shot!"); + static_assert(IteratorQScale::kAccessesPerVector == 1, "Quant scale should 1 access per vector!"); + + typename IteratorQScale::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_QScale_.get()); + + constexpr int kSrcBytes = + sizeof_bits::value * + IteratorQScale::ThreadMap::kElementsPerAccess / 8; + + auto gmem_ptr = iterator_QScale.get(); + + cutlass::arch::cp_async( + dst_ptr, gmem_ptr, iterator_QScale.valid()); + + if constexpr (kHasQOffset) { + iterator_QOffset.clear_mask(gemm_k_iterations == 0 || !should_load_qoffset_); + + // Async Copy for quantization offset + static_assert(Detail::AsyncCopyIterationsPerStageQOffset == 1, "Quant offset should be loaded in one shot!"); + static_assert(IteratorQOffset::kAccessesPerVector == 1, "Quant offset should 1 access per vector!"); + typename IteratorQOffset::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_QOffset_.get()); + + constexpr int kSrcBytes = + sizeof_bits::value * + 
IteratorQOffset::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async( + dst_ptr, iterator_QOffset.get(), iterator_QOffset.valid()); + } + + // Move to the next write stage + advance_smem_write_stage(iterator_A, iterator_B, iterator_QScale, iterator_QOffset); + + // Defines the boundary of a stage of cp.async. + cutlass::arch::cp_async_fence(); + } + } + + + /// Wait until we have at least one completed global fetch stage + CUTLASS_DEVICE + void gmem_wait() + { + // Wait until we have at least one committed global fetch stage. (#uncommitted = Base::kStages - 1 - #committed) + cutlass::arch::cp_async_wait(); + __syncthreads(); + + if constexpr(debug_layout) { + if (LayoutDebugType::debug_smem && layout_debug_.block_id_ == 1) { + if (threadIdx.x == 0){ + printf("stage: %d\n", smem_write_stage_idx_); + } + cutlass::debug::dump_shmem(layout_debug_.smem_qscale_ptr_, Base::SharedStorage::ShapeQScale::kCount); + if constexpr(kHasQOffset){ + cutlass::debug::dump_shmem(layout_debug_.smem_qoffset_ptr_, Base::SharedStorage::ShapeQScale::kCount); + } + } + } + } + + /// Perform a threadblock mainloop iteration of matrix multiply-accumulate + CUTLASS_DEVICE + void mac_loop_iter( + PipeState &pipe_state, ///< [in|out] loop-carried pipeline state + FragmentC &accum, ///< [in|out] destination accumulator tile + IteratorA &iterator_A, ///< [in|out] iterator over A operand in global memory + IteratorB &iterator_B, ///< [in|out] iterator over B operand in global memory + IteratorQScale &iterator_QScale, ///< [in|out] iterator over quant scales in global memory + IteratorQOffset &iterator_QOffset, ///< [in|out] iterator over quant offsets in global memory + int &gemm_k_iterations) ///< [in|out] number of threadblock mainloop iterations remaining + { + // Unroll the warp-level MMA tiles of a threadblock's mainloop iteration + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) { + // Loading next warp-level tiles from shared memory. 
This can be skipped on the very + // last iteration where: + // (gemm_k_iterations == (1 - Base::kStages)) && (warp_mma_k == (Base::kWarpGemmIterations - 1)) + // However, evaluating this condition seems more expensive than simply loading the tiles + this->warp_tile_iterator_QScale_.load( + pipe_state.warp_loaded_frag_QScale_, + pipe_state.warp_loaded_frag_QOffset_); + ++this->warp_tile_iterator_QScale_; + + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_); + ++this->warp_tile_iterator_B_; + + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_A_.load(pipe_state.warp_loaded_frag_A_[(warp_mma_k + 1) % 2]); + ++this->warp_tile_iterator_A_; + + // All warp-tiles issue their share of global->shared fragment copies + copy_tiles_and_advance( + iterator_A, + iterator_B, + (warp_mma_k + 1) % Base::kWarpGemmIterations); + + if constexpr(debug_layout) { + if (LayoutDebugType::debug_fragment && layout_debug_.block_id_ == 1 && layout_debug_.warp_id_ == 0 && layout_debug_.lane_id_ == 0){ + printf("LINE %d, warp_tile_B kgroup %d\n", __LINE__, warp_mma_k % Base::kWarpGemmIterations); + } + LayoutDebugType::print_as_int4(pipe_state.warp_loaded_frag_B_, 'W', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + LayoutDebugType::print_fragment(Operator::IteratorQScale::debug_expand(pipe_state.warp_loaded_frag_QScale_), 'Q', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + if constexpr(kHasQOffset){ + LayoutDebugType::print_fragment(Operator::IteratorQScale::debug_expand(pipe_state.warp_loaded_frag_QOffset_), 'O', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + } + } + + warp_mma_.transform( + pipe_state.warp_transformed_frag_B_[(warp_mma_k + 1) % 2], + pipe_state.warp_loaded_frag_B_, + pipe_state.warp_loaded_frag_QScale_, + pipe_state.warp_loaded_frag_QOffset_); + + if constexpr(debug_layout) { + LayoutDebugType::print_fragment(pipe_state.warp_transformed_frag_B_[(warp_mma_k + 1) % 2], 'B', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + } + + // Execute the current warp-tile of MMA operations + if (Detail::kStagedAccumulation) { + warp_mma_( + pipe_state.tmp_accum_, + pipe_state.warp_loaded_frag_A_[warp_mma_k % 2], + pipe_state.warp_transformed_frag_B_[warp_mma_k % 2], + pipe_state.tmp_accum_ + ); + + if (warp_mma_k == 0) { + plus plus_accum; + accum = plus_accum(accum, pipe_state.tmp_accum_); + pipe_state.tmp_accum_.clear(); + } + } else { + warp_mma_( + accum, + pipe_state.warp_loaded_frag_A_[warp_mma_k % 2], + pipe_state.warp_transformed_frag_B_[warp_mma_k % 2], + accum + ); + } + + if (warp_mma_k == 0) { + copy_qscale_tiles(iterator_QScale); + } + if (warp_mma_k == 1) { + copy_qoffset_tiles(iterator_QOffset); + } + + // The second-to-last warp-tile also moves to the next global fetch stage + if (warp_mma_k == Base::kWarpGemmIterations - 2) { + // Inserts a memory fence between stages of cp.async instructions. 
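+        // (The fence commits the cp.async instructions issued for this stage as
+        // one group; gmem_wait() below then blocks until at least one committed
+        // group has fully landed in shared memory before it is consumed.)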
+ cutlass::arch::cp_async_fence(); + + // Move to the next global fetch stage + advance_smem_write_stage(iterator_A, iterator_B, iterator_QScale, iterator_QOffset); + advance_smem_read_stage(); + + // Disable global fetching when done with global fetch iterations + --gemm_k_iterations; + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + iterator_QScale.clear_mask(gemm_k_iterations == 0 || !should_load_qscale_); + if constexpr(kHasQOffset){ + iterator_QOffset.clear_mask(gemm_k_iterations == 0 || !should_load_qoffset_); + } + + // Wait until we have at least one completed global fetch stage + gmem_wait(); + } + + } + } + + /// Specialized mainloop iteration of matrix multiply-accumulate, for small M + CUTLASS_DEVICE + void mac_loop_iter_small_m( + PipeState &pipe_state, ///< [in|out] loop-carried pipeline state + FragmentC &accum, ///< [in|out] destination accumulator tile + IteratorA &iterator_A, ///< [in|out] iterator over A operand in global memory + IteratorB &iterator_B, ///< [in|out] iterator over B operand in global memory + IteratorQScale &iterator_QScale, ///< [in|out] iterator over quant scales in global memory + IteratorQOffset &iterator_QOffset, ///< [in|out] iterator over quant offsets in global memory + int &gemm_k_iterations) ///< [in|out] number of threadblock mainloop iterations remaining + { + // Unroll the warp-level MMA tiles of a threadblock's mainloop iteration + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) { + // In the case of small M, memory latency dominates. We try to move uses far + // from their definitions to hide latency. + if constexpr(debug_layout) { + if (LayoutDebugType::debug_fragment && layout_debug_.block_id_ == 1 && layout_debug_.warp_id_ == 0 && layout_debug_.lane_id_ == 0){ + printf("LINE %d, warp_tile_B kgroup %d\n", __LINE__, warp_mma_k % Base::kWarpGemmIterations); + } + LayoutDebugType::print_as_int4(pipe_state.warp_loaded_frag_B_, 'W', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + LayoutDebugType::print_fragment(Operator::IteratorQScale::debug_expand(pipe_state.warp_loaded_frag_QScale_), 'Q', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + if constexpr(kHasQOffset){ + LayoutDebugType::print_fragment(Operator::IteratorQScale::debug_expand(pipe_state.warp_loaded_frag_QOffset_), 'O', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + } + } + + warp_mma_.transform( + pipe_state.warp_transformed_frag_B_[(warp_mma_k) % 2], + pipe_state.warp_loaded_frag_B_, + pipe_state.warp_loaded_frag_QScale_, + pipe_state.warp_loaded_frag_QOffset_); + + if constexpr(debug_layout) { + LayoutDebugType::print_fragment(pipe_state.warp_transformed_frag_B_[(warp_mma_k) % 2], 'B', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + } + + // Loading next warp-level tiles from shared memory. 
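+      // (Issued ahead of the MMAs below so that, in this latency-bound
+      // small-M case, the shared-memory load latency overlaps with the math.)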
+ this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_); + ++this->warp_tile_iterator_B_; + + this->warp_tile_iterator_QScale_.load( + pipe_state.warp_loaded_frag_QScale_, + pipe_state.warp_loaded_frag_QOffset_); + ++this->warp_tile_iterator_QScale_; + + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_A_.load(pipe_state.warp_loaded_frag_A_[(warp_mma_k + 1) % 2]); + ++this->warp_tile_iterator_A_; + + // All warp-tiles issue their share of global->shared fragment copies + copy_tiles_and_advance( + iterator_A, + iterator_B, + (warp_mma_k + 1) % Base::kWarpGemmIterations); + + // Execute the current warp-tile of MMA operations + if (Detail::kStagedAccumulation) { + warp_mma_( + pipe_state.tmp_accum_, + pipe_state.warp_loaded_frag_A_[warp_mma_k % 2], + pipe_state.warp_transformed_frag_B_[warp_mma_k % 2], + pipe_state.tmp_accum_ + ); + + if (warp_mma_k == 0) { + plus plus_accum; + accum = plus_accum(accum, pipe_state.tmp_accum_); + pipe_state.tmp_accum_.clear(); + } + } else { + warp_mma_( + accum, + pipe_state.warp_loaded_frag_A_[warp_mma_k % 2], + pipe_state.warp_transformed_frag_B_[warp_mma_k % 2], + accum + ); + } + + // The second-to-last warp-tile also moves to the next global fetch stage + if (warp_mma_k == Base::kWarpGemmIterations - 2) { + // Inserts a memory fence between stages of cp.async instructions. + cutlass::arch::cp_async_fence(); + + // Move to the next global fetch stage + advance_smem_write_stage(iterator_A, iterator_B, iterator_QScale, iterator_QOffset); + advance_smem_read_stage(); + + // Disable global fetching when done with global fetch iterations + --gemm_k_iterations; + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + iterator_QScale.clear_mask(gemm_k_iterations == 0 || !should_load_qscale_); + if constexpr(kHasQOffset){ + iterator_QOffset.clear_mask(gemm_k_iterations == 0 || !should_load_qoffset_); + } + + copy_qscale_tiles(iterator_QScale); + copy_qoffset_tiles(iterator_QOffset); + + // Wait until we have at least one completed global fetch stage + gmem_wait(); + } + + } + } + + + /// Perform the specified number of threadblock mainloop iterations of matrix + /// multiply-accumulate. Assumes prologue has been initiated. 
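+  /// Rough shape of the pipeline (an illustrative sketch, not the exact code):
+  ///   load the first warp-level tiles of A/B/scales from shared memory;
+  ///   while (gemm_k_iterations > -(kStages - 1)) {
+  ///     transform B with its scales/offsets, run the warp-level MMAs,
+  ///     issue the next stage's global->shared cp.async copies,
+  ///     and advance the circular shared-memory read/write stages;
+  ///   }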
+ CUTLASS_DEVICE + void gemm_iters( + int gemm_k_iterations, ///< number of threadblock mainloop iterations + FragmentC &accum, ///< [in|out] accumulator tile + IteratorA &iterator_A, ///< [in|out] iterator over A operand in global memory + IteratorB &iterator_B, ///< [in|out] iterator over B operand in global memory + IteratorQScale &iterator_QScale, ///< [in|out] iterator over QScale operand in global memory + IteratorQOffset &iterator_QOffset) ///< [in|out] iterator over QOffset operand in global memory + { + PipeState pipe_state; + + // Disable global fetching if done with global fetch iterations + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + iterator_QScale.clear_mask(gemm_k_iterations == 0 || !should_load_qscale_); + if constexpr(kHasQOffset) { + iterator_QOffset.clear_mask(gemm_k_iterations == 0 || !should_load_qoffset_); + } + + // Load first warp-tile's B fragment from shared memory + this->warp_tile_iterator_QScale_.load( + pipe_state.warp_loaded_frag_QScale_, + pipe_state.warp_loaded_frag_QOffset_); + ++this->warp_tile_iterator_QScale_; + + this->warp_tile_iterator_B_.set_kgroup_index(0); + this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_); + ++this->warp_tile_iterator_B_; + + // Load first warp-tile's A fragment from shared memory + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_A_.load(pipe_state.warp_loaded_frag_A_[0]); + ++this->warp_tile_iterator_A_; + + copy_tiles_and_advance(iterator_A, iterator_B, 0); + + if constexpr(Shape::kM > 32) { + // the case of bigger m + if constexpr(debug_layout) { + if (LayoutDebugType::debug_fragment && layout_debug_.block_id_ == 1 && layout_debug_.warp_id_ == 0 && layout_debug_.lane_id_ == 0){ + printf("LINE %d, warp_tile_B kgroup %d\n", __LINE__, 0); + } + LayoutDebugType::print_as_int4(pipe_state.warp_loaded_frag_B_, 'W', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + LayoutDebugType::print_fragment(Operator::IteratorQScale::debug_expand(pipe_state.warp_loaded_frag_QScale_), 'Q', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + if constexpr(kHasQOffset){ + LayoutDebugType::print_fragment(Operator::IteratorQScale::debug_expand(pipe_state.warp_loaded_frag_QOffset_), 'O', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + } + } + + warp_mma_.transform( + pipe_state.warp_transformed_frag_B_[0], + pipe_state.warp_loaded_frag_B_, + pipe_state.warp_loaded_frag_QScale_, + pipe_state.warp_loaded_frag_QOffset_); + + if constexpr(debug_layout) { + LayoutDebugType::print_fragment(pipe_state.warp_transformed_frag_B_[0], 'B', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + } + } else { + // the case of small m + copy_qscale_tiles(iterator_QScale); + copy_qoffset_tiles(iterator_QOffset); + } + + if (Detail::kStagedAccumulation) { + pipe_state.tmp_accum_.clear(); + } + + // Mainloop + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-Base::kStages + 1);) { + if constexpr(Shape::kM > 32) { + mac_loop_iter( + pipe_state, + accum, + iterator_A, + iterator_B, + iterator_QScale, + iterator_QOffset, + gemm_k_iterations); + } else { + mac_loop_iter_small_m( + pipe_state, + accum, + iterator_A, + iterator_B, + iterator_QScale, + iterator_QOffset, + gemm_k_iterations); + } + } + + if (Detail::kStagedAccumulation) { + plus plus_accum; + accum = plus_accum(accum, pipe_state.tmp_accum_); + } + + // Commit and drain all pending and predicated cp.async pnz from the 
GEMM mainloop + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + + } + + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC &accum, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + ///< iterator over quant scales in global memory + IteratorQScale iterator_QScale, + ///< Iterator over quant offsets in global memory + IteratorQOffset iterator_QOffset, + ///< initial value of accumulator + FragmentC const &src_accum) { + + // Prologue (start fetching iterations of global fragments into shared memory) + prologue(iterator_A, iterator_B, iterator_QScale, iterator_QOffset, gemm_k_iterations); + + // Wait until we have at least one completed global fetch stage + gmem_wait(); + + // Initialize destination accumulators with source accumulators + accum = src_accum; + + // Perform the MAC-iterations + gemm_iters(gemm_k_iterations, accum, iterator_A, iterator_B, iterator_QScale, iterator_QOffset); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/default_quantb_mma_tensor_op.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/default_quantb_mma_tensor_op.h new file mode 100644 index 0000000000000..2c49888c94504 --- /dev/null +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/default_quantb_mma_tensor_op.h @@ -0,0 +1,112 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** + * Modifications Copyright (c) Microsoft. + * Licensed under the MIT license. + * + * @file default_quantb_mma_tensor_op.h + * @brief Modified from cutlass/gemm/warp/default_mma_tensor_op.h + * Default warp-level GEMM operators selected by data type, size, and layouts of operands. + */ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass_ext/q4gemm/warp/quantb_mma_tensor_op.h" + +namespace cutlass { +namespace gemm { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for m-by-n-by-kgroup +template < + /// Shape of one matrix production operation (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A elements + typename ElementA, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Data type of B elements + typename ElementB, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Data type of quant scales + typename ElementQScale, + /// Layout of quant scales (concept: MatrixLayout) + typename SmemLayoutQScale, + /// Data type of quant offsets + typename ElementQOffset, + /// Layout of quant offsets (concept: MatrixLayout) + typename SmemLayoutQOffset, + /// Blocking size of quantization + typename QuantBlocking, + /// Element type of C matrix + typename ElementC, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Operator describing the tensor operation + typename Operator_ = arch::OpMultiplyAdd, + /// Number of partitions along K dimension + int PartitionsK = 1, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. 
+ bool AccumulatorsInRowMajor = false> +struct DefaultQuantBMmaTensorOp { + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma, + cutlass::MatrixShape<1, 1> >; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::QuantBMmaTensorOp< + WarpShape_, ElementA, LayoutA, ElementB, LayoutB, ElementQScale, SmemLayoutQScale, + ElementQOffset, SmemLayoutQOffset, QuantBlocking, ElementC, LayoutC, + Policy, PartitionsK, AccumulatorsInRowMajor>; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h new file mode 100644 index 0000000000000..4ba39dda3db8d --- /dev/null +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h @@ -0,0 +1,883 @@ +/** + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT license. + * + * @file quantb_meta_mma_tensor_op_tile_iterator.h + * @brief Templates for loading quantization meta data for operand B + * from shared memory to fragments. This is meant to be used in + * lock step with the operand B tile iterator. Containing logic + * to figure out the operand B layout in the tensor core, + * and deliver each meta data element to its corresponding + * operand B element for dequantization. + */ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor_op_multiplicand_sm75.h" + +#include "cutlass/platform/platform.h" +#include "cutlass/fast_math.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace{ + +struct b32_pair{ + uint32_t a; + uint32_t b; +}; + +struct fp16_quad{ + cutlass::half_t a; + cutlass::half_t b; + cutlass::half_t c; + cutlass::half_t d; +}; + +struct b16_quad{ + int16_t a; + int16_t b; + int16_t c; + int16_t d; +}; + +union b64 { + uint64_t single; + b32_pair pair; + b16_quad quard; + fp16_quad fp16_quad; +}; + +static_assert(sizeof(b64) == 8, "b64 should be 64 bits"); + +/// Convert packed 4b weights into fp16(weight + 16) +/// Current bit hacking only supports fp16, need to add bf16 later. 
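+/// The trick relies on the IEEE fp16 bit layout: for a 4-bit weight w in
+/// [0, 15], the bit pattern 0x4c00 | (w << 6) has exponent 4 and mantissa
+/// w/16, i.e. it encodes 2^4 * (1 + w/16) == 16 + w. A scalar sketch of what
+/// the PTX below computes per nibble (illustrative only):
+///   uint16_t bits = 0x4c00 | (uint16_t(w & 0xf) << 6);   // == fp16(16 + w)
+/// The +16 bias cancels later during dequantization, which uses an fp16x2 FMA:
+///   scale * (w + 16) + scale * (-16 - offset) == scale * (w - offset).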
+/// +template +CUTLASS_DEVICE +void weights2Half(cutlass::Array const &weights, + cutlass::Array& dest) +{ + static_assert(Size % 8 == 0, "Weights should have been prepacked by 2x2 tiles, 2 weights per tile."); + uint32_t* dest_pair = reinterpret_cast(dest.data()); + const uint32_t* w_oct = reinterpret_cast(weights.data()); + + CUTLASS_PRAGMA_UNROLL + for (int oct_idx = 0; oct_idx < Size/8; oct_idx++, w_oct++, dest_pair += 4){ +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + + // static_cast(16 + weight) + // 4b weights are prepacked into [0, 2, 4, 6, 1, 3, 5, 7], so that adjacent weights + // are in different 16b half words, making it easier to convert to fp16. + asm volatile( + "{\n\t" + " shl.b32 %0, %4, 6;\n" + " shl.b32 %1, %4, 2;\n" + " shr.u32 %2, %4, 2;\n" + " shr.u32 %3, %4, 6;\n" + " lop3.b32 %0, %0, 0x03c003c0, 0x4c004c00, 0xea;\n" // a & 0x03c0 | 0x4c00 + " lop3.b32 %1, %1, 0x03c003c0, 0x4c004c00, 0xea;\n" + " lop3.b32 %2, %2, 0x03c003c0, 0x4c004c00, 0xea;\n" + " lop3.b32 %3, %3, 0x03c003c0, 0x4c004c00, 0xea;\n" + "}\n" + : "=r"(dest_pair[0]), "=r"(dest_pair[1]), + "=r"(dest_pair[2]), "=r"(dest_pair[3]) + : "r"(*w_oct)); +#else + assert(0); +#endif + } + +} + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace warp { + +//////////////////////////////////////////////////////////////////////////////// + +// Traits to describe the layout of quantization meta data layout in a MMA fragment +// Since operand B is quantized on a per block basis, it's one meta data per block. + +template < + /// Shape of the operand B matrix to load in a warp (concept: MatrixShape) + typename WarpShapeB_, + /// Block dimensions of the blockwise quantization. So the actual meta data + /// warp shape is WarpShapeB_ / BlockingShape_ + typename BlockingShape_, + /// Underlying matrix multiply operator (concept: arch::Mma) + typename ArchMmaOperator_, + /// Number of threads participating in one matrix operation + int Threads> +class QuantBMetaMmaTile{ +public: + + using WarpShapeB = WarpShapeB_; + using BlockingShape = BlockingShape_; + using ArchMmaOperator = ArchMmaOperator_; + + static_assert(Threads == 32, "This iterator should work in a warp only."); + + /// Shape of the curresponding operand B tile iterator + using TileShapeB = MatrixShape; + + // Tensor core operand B layout is a column major 4x8 tile, divided + // into 32 threads (T0 ~ T31) as shown below. Each element of the tile is 32b, + // so for fp16 it becomes 8 x 8, and int8 it becomes 16 x 8. 
+ // T0 | T4 | T8 | T12 | T16 | T20 | T24 | T28 + // T1 | T5 | T9 | T13 | T17 | T21 | T25 | T29 + // T2 | T6 | T10 | T14 | T18 | T22 | T26 | T30 + // T3 | T7 | T11 | T15 | T19 | T23 | T27 | T31 + using CoreTile = layout::PitchLinearShape<4, 8>; + + /// Each thread holds a 32b fragment per tile: for half precision, it's 2 elements, 4 elements for int8 + static int const kNumBsPerCoreTileFragement = 32 / sizeof_bits::value; + + /// Each mma instruction can process either 1 or 2 tensor core operand B tiles (stacked on the k dimension) + static int const kBTilesPerMma = + sizeof_bits::value * ArchMmaOperator::FragmentB::kElements / 32; + static_assert(kBTilesPerMma == 1 || kBTilesPerMma == 2, "Only support 1 or 2 operand B tiles per mma."); + + /// Each operand B tile iterator load covers a number of mma instructions + static int const kMmaIterationsB = WarpShapeB::kColumn / ArchMmaOperator::Shape::kN; + + /// Number of B elements a fragment of meta data should cover + static int const kExpandedSize = kNumBsPerCoreTileFragement * kBTilesPerMma * kMmaIterationsB; + + // Now we figure out how many meta data elements to load for each TileShapeB + + /// Number of meta elements per CoreTile. + static int const kCoreTileFragementSize = (kNumBsPerCoreTileFragement + BlockingShape::kRow - 1) / BlockingShape::kRow; + + /// Number of core tiles per mma instruction, different from kBTilesPerMma when blocking size on K dimension + /// exceeds the tile depth, so two tiles share the same meta data + static int const kTilesPerMma = ((kBTilesPerMma == 2) && + (BlockingShape::kRow <= kNumBsPerCoreTileFragement * CoreTile::kContiguous)) + ? 2 : 1; + + /// stride to reach the meta data for the next CoreTile on the K dimension + static int const kKTileStride = (kNumBsPerCoreTileFragement * CoreTile::kContiguous + BlockingShape::kRow - 1) / BlockingShape::kRow; + + /// Stride on N dimension should be the tile width, shrunk by blocking size on this dimension. + static int const kNStride = (CoreTile::kStrided + BlockingShape::kColumn - 1) / BlockingShape::kColumn; + + /// On N dimension, how many tiles share the same meta data + static int const kNRepeats = (BlockingShape::kColumn + CoreTile::kStrided - 1) / CoreTile::kStrided; + + /// Each fragment should cover kMmaIterationsB number of mma intructions on the N dimension. + /// When blocking size on this dimension exceeds the tile width, multiple iterations + /// would share the same data. + static int const kMmaIterations = (kMmaIterationsB + kNRepeats - 1) / kNRepeats; + + static int const kFragementSize = kCoreTileFragementSize * kTilesPerMma * kMmaIterations; + + CUTLASS_DEVICE + static MatrixCoord lane_position(int lane_id) { + if constexpr(kNumBsPerCoreTileFragement == 2 + && kBTilesPerMma == 2 + && BlockingShape::kRow == 1){ + // Optimize for a special case of: + // 16b gemm (kNumBsPerCoreTileFragement == 2) + // 2 B operand tiles per mma (kBTilesPerMma == 2) + // (1,n) quantization blocking + // The scale and offset tensors are prepacked to reduce the number of load instructions. + return make_Coord((lane_id % CoreTile::kContiguous) * 4, + lane_id / CoreTile::kContiguous); + } else { + return make_Coord((lane_id % CoreTile::kContiguous) * kNumBsPerCoreTileFragement, + lane_id / CoreTile::kContiguous); + } + } +}; + + +//////////////////////////////////////////////////////////////////////////////// + +/// This tile iterator is to load quantization meta data for operand B from +/// shared memory to fragments (hopefully allocated to registers by compilers). 
+/// Examples of meta data include scale or offsets. The operand B matrix is +/// quantized on a per block basis, meaning one element of meta data per block. +/// +/// This is meant to be used in lock step with the operand B tile iterator. +/// So all parameters are logical positions in the operand B tiles. +/// The goal here is to deliver each meta data element to its corresponding +/// operand B element for dequantization. As a result, we need to figure +/// out the operand B layout in the tensor core. +/// +template < + /// Shape of the operand B matrix to load in a warp (concept: MatrixShape) + typename WarpShapeB_, + /// Block dimensions of the blockwise quantization. So the actual meta data + /// warp shape is WarpShapeB_ / BlockingShape_ + typename BlockingShape_, + /// Data type of the quant scales + typename ElementScale_, + /// Layout of the quant scales + typename LayoutScale_, + /// Data type of quant offsets + typename ElementOffset_, + /// Layout of quant offsets + typename LayoutOffset_, + /// Underlying matrix multiply operator (concept: arch::Mma) + typename ArchMmaOperator_, + /// Number of threads participating in one matrix operation + int Threads, + /// Number of partitions along K dimension + int PartitionsK_ = 1> +class QuantBMetaMmaTensorOpTileIterator; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for column major layout + +template < + /// Shape of the operand B matrix to load in a warp (concept: MatrixShape) + typename WarpShapeB_, + /// Block dimensions of the blockwise quantization. So the actual meta data + /// warp shape is WarpShapeB_ / BlockingShape_ + typename BlockingShape_, + /// Data type of the meta data elements + typename ElementScale_, + /// Data type of quant offsets + typename ElementOffset_, + /// Underlying matrix multiply operator (concept: arch::Mma) + typename ArchMmaOperator_, + /// Number of threads participating in one matrix operation + int Threads> +class QuantBMetaMmaTensorOpTileIterator{ +public: + + using WarpShapeB = WarpShapeB_; + using BlockingShape = BlockingShape_; + using ElementScale = ElementScale_; + using Layout = cutlass::layout::ColumnMajor; + using ElementOffset = ElementOffset_; + using ArchMmaOperator = ArchMmaOperator_; + + static constexpr bool kHasOffset = !(std::is_same::value); + + static_assert(BlockingShape::kRow == 1 && BlockingShape::kColumn > 1, + "Only support row blocking for column major layout"); + + using MetaTile = QuantBMetaMmaTile; + + /// Number of MMA instructions for this tile + static constexpr int kMmaIterationsB = MetaTile::kMmaIterationsB; + + /// Number of B elements per mma tile fragment (32b), 2 for half precision, 4 for int8 + static constexpr int kNumBsPerCoreTileFragement = MetaTile::kNumBsPerCoreTileFragement; + + /// Each mma instruction can process either 1 or 2 operand B tiles (stacked on the k dimension) + static constexpr int kBTilesPerMma = MetaTile::kBTilesPerMma; + + /// Number of B elements a fragment of meta data should cover + static constexpr int kExpandedSize = MetaTile::kExpandedSize; + + /// Number of meta elements per core tile fragment + static constexpr int kCoreTileFragementSize = MetaTile::kCoreTileFragementSize; + + /// stride for reaching the next core tile (if there is one) on the K dimension + static constexpr int kKTileStride = MetaTile::kKTileStride; + + /// do we need to load meta data for the next core tile on the K dimension? 
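+  /// (Yes when kTilesPerMma == 2: each of the two stacked core tiles then
+  /// reads its own meta element; otherwise a single element per mma
+  /// instruction suffices.)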
+ static constexpr int kTilesPerMma = MetaTile::kTilesPerMma; + + static constexpr int kNStride = MetaTile::kNStride; + static constexpr int kNRepeats = MetaTile::kNRepeats; + static constexpr int kMmaIterations = MetaTile::kMmaIterations; + + using TensorRefScale = TensorRef; + using TensorRefOffset = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + using StrideIndex = typename Layout::Stride::Index; + + using FragmentScale = Array; + using FragmentOffset = typename std::conditional, + std::monostate>::type; + + using AccessTypeScale = Array; + using AccessTypeOffset = Array; + +private: + + ElementScale *pointer_; + Layout layout_; + + ElementOffset *pointer_offset_; + Layout layout_offset_; + + TensorCoord lane_position_; + +public: + + CUTLASS_DEVICE + QuantBMetaMmaTensorOpTileIterator() { } + + CUTLASS_DEVICE + QuantBMetaMmaTensorOpTileIterator( + TensorRefScale const &ref, + TensorRefOffset const &ref_offset, + int lane_idx + ): + pointer_(ref.data()), + layout_(ref.layout()), + pointer_offset_(ref_offset.data()), + layout_offset_(ref_offset.layout()), + lane_position_(MetaTile::lane_position(lane_idx)){} + + /// Loads a fragment + CUTLASS_HOST_DEVICE + void load(FragmentScale &frag, FragmentOffset &frag_offset) { + if constexpr(kNumBsPerCoreTileFragement == 2 + && kBTilesPerMma == 2){ + // Optimize for a special case of: + // 16b gemm (kNumBsPerCoreTileFragement == 2) + // 2 B operand tiles per mma (kBTilesPerMma == 2) + // (1,n) quantization blocking (BlockingShape::kRow == 1) + // The scale and offset tensors are prepacked to reduce the number of load instructions needed + const int row = lane_position_.row(); + const int column = lane_position_.column() / BlockingShape::kColumn; + + Array *dst_ptr = reinterpret_cast*>(frag.data()); + CUTLASS_PRAGMA_UNROLL + for (int n_idx = 0, c = column; n_idx < kMmaIterations; n_idx++, c += kNStride){ + Array *src_ptr = reinterpret_cast*>(pointer_ + layout_({row, c})); + *dst_ptr = *src_ptr; + dst_ptr++; + } + + if constexpr(kHasOffset){ + Array *dst_ptr_offset = reinterpret_cast*>(frag_offset.data()); + CUTLASS_PRAGMA_UNROLL + for (int n_idx = 0, c = column; n_idx < kMmaIterations; n_idx++, c += kNStride){ + Array *src_ptr_offset = reinterpret_cast*>(pointer_offset_ + layout_offset_({row, c})); + *dst_ptr_offset = *src_ptr_offset; + dst_ptr_offset++; + } + } + + } else { + // Other cases, offsets and scales are not prepacked. 
+ + const int row = lane_position_.row() / BlockingShape::kRow; + const int column = lane_position_.column() / BlockingShape::kColumn; + + AccessTypeScale* dst_ptr = reinterpret_cast(frag.data()); + CUTLASS_PRAGMA_UNROLL + for (int n_idx = 0, c = column; n_idx < kMmaIterations; n_idx++, c += kNStride){ + CUTLASS_PRAGMA_UNROLL + for (int mma_tile_idx = 0, r = row; mma_tile_idx < kTilesPerMma; mma_tile_idx++, r += kKTileStride){ + AccessTypeScale* src_ptr = reinterpret_cast(pointer_ + layout_({r, c})); + *dst_ptr = *src_ptr; + dst_ptr++; + } + } + + if constexpr(kHasOffset){ + AccessTypeOffset* dst_ptr = reinterpret_cast(frag_offset.data()); + CUTLASS_PRAGMA_UNROLL + for (int n_idx = 0, c = column; n_idx < kMmaIterations; n_idx++, c += kNStride){ + CUTLASS_PRAGMA_UNROLL + for (int mma_tile_idx = 0, r = row; mma_tile_idx < kTilesPerMma; mma_tile_idx++, r += kKTileStride){ + AccessTypeOffset* src_ptr = reinterpret_cast(pointer_offset_ + layout_offset_({r, c})); + *dst_ptr = *src_ptr; + dst_ptr++; + } + } + } + } + } + + template + CUTLASS_HOST_DEVICE + static Array debug_expand(Array const &frag){ + Array ret; + int out_idx = 0; + CUTLASS_PRAGMA_UNROLL + for (int n_out = 0; n_out < kMmaIterationsB; n_out++){ + int n_idx = n_out / kNRepeats; + CUTLASS_PRAGMA_UNROLL + for (int mma_tile_out_idx = 0; mma_tile_out_idx < kBTilesPerMma; mma_tile_out_idx++){ + int mma_tile_idx = mma_tile_out_idx / (kBTilesPerMma / kTilesPerMma); + CUTLASS_PRAGMA_UNROLL + for (int elem_out_idx = 0; elem_out_idx < kNumBsPerCoreTileFragement; elem_out_idx++){ + int elem_idx = elem_out_idx / BlockingShape::kRow; + int idx = elem_idx + mma_tile_idx * kCoreTileFragementSize + n_idx * kCoreTileFragementSize * kTilesPerMma; + ret[out_idx] = frag[idx]; + out_idx++; + } + } + } + return ret; + } + + CUTLASS_HOST_DEVICE + static void dequant(FragmentScale const &scales, + FragmentOffset const &offsets, + Array const &weights, + Array& dest){ + static_assert(kNumBsPerCoreTileFragement == 2, "Only for 16b gemm."); + static_assert(kExpandedSize % 8 == 0, "Weights should have been prepacked by 2x2 tiles, 2 weights per tile."); + + // First convert 4b weight into fp16(weight + 16) + weights2Half(weights, dest); + + if constexpr(kBTilesPerMma == 2){ + // Optimize for a special case of: + // 2 B operand tiles per mma (kBTilesPerMma == 2) + // (1,n) quantization blocking (BlockingShape::kRow == 1) + + uint32_t* dest_pair = reinterpret_cast(dest.data()); + const b64* scales_ptr = reinterpret_cast(scales.data()); + const ElementOffset* offsets_ptr = nullptr; + if constexpr(kHasOffset) { offsets_ptr = offsets.data(); } + + CUTLASS_PRAGMA_UNROLL + for (int n_idx = 0; n_idx < kMmaIterations; n_idx++){ + // dequantize: d = scale * (weight - offset) + // to use FMA, d = scale * weight + (scale * (-offset)) + + b64 offsets; + if constexpr(kHasOffset){ + const uint32_t* p = reinterpret_cast(offsets_ptr); + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + asm volatile( + "{\n\t" + " .reg .b32 rb0, rb1;\n" // b32 regs for fp16x2 mul operands + + // static_cast(-16 - offset) + // input [d, b, c, a], + " shl.b32 rb0, %4, 6;\n" // rb0 = [x, b, x, a] << 6 + " shr.u32 rb1, %4, 2;\n" // rb1 = [x, d, x, c] << 6 + " lop3.b32 rb0, rb0, 0x03c003c0, 0xcc00cc00, 0xea;\n" // a & 0x03c0 | 0xcc00 + " lop3.b32 rb1, rb1, 0x03c003c0, 0xcc00cc00, 0xea;\n" + " mul.rn.f16x2 %0, %2, rb0;\n" // offset = scale * (-16 - offset) + " mul.rn.f16x2 %1, %3, rb1;\n" + "}\n" + : "=r"(offsets.pair.a), "=r"(offsets.pair.b) + : "r"(scales_ptr->pair.a), 
"r"(scales_ptr->pair.b), + "r"(p[0])); +#else + assert(0); +#endif + + offsets_ptr += 4; + } else { +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + asm volatile( + "{\n\t" + " .reg .b32 rb0;\n" + " mov.u32 rb0, 0xce00ce00;\n" + " mul.rn.f16x2 %0, %2, rb0;\n" // offset = scale * (-16 - 8) + " mul.rn.f16x2 %1, %3, rb0;\n" + "}\n" + : "=r"(offsets.pair.a), "=r"(offsets.pair.b) + : "r"(scales_ptr->pair.a), "r"(scales_ptr->pair.b)); +#else + offsets.fp16_quad.a = scales_ptr->fp16_quad.a * static_cast(-16-8); + offsets.fp16_quad.b = scales_ptr->fp16_quad.b * static_cast(-16-8); + offsets.fp16_quad.c = scales_ptr->fp16_quad.c * static_cast(-16-8); + offsets.fp16_quad.d = scales_ptr->fp16_quad.d * static_cast(-16-8); +#endif + } + + CUTLASS_PRAGMA_UNROLL + for (int n_r = 0; n_r < kNRepeats; n_r++){ +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + asm volatile( + "{\n\t" + " fma.rn.f16x2 %0, %2, %0, %4;\n" // dest = scale * (16 + weight) + (scale * (-16 - offset)) + " fma.rn.f16x2 %1, %3, %1, %5;\n" + "}\n" + : "+r"(dest_pair[0]), "+r"(dest_pair[1]) + : "r"(scales_ptr->pair.a), "r"(scales_ptr->pair.b), + "r"(offsets.pair.a), "r"(offsets.pair.b)); +#else + assert(0); +#endif + dest_pair += 2; + } + scales_ptr++; + } + + } else { + // unoptiomized path for other cases, very slow + int out_idx = 0; + ElementScale offset; + CUTLASS_PRAGMA_UNROLL + for (int n_out = 0; n_out < kMmaIterationsB; n_out++){ + int n_idx = n_out / kNRepeats; + CUTLASS_PRAGMA_UNROLL + for (int mma_tile_out_idx = 0; mma_tile_out_idx < kBTilesPerMma; mma_tile_out_idx++){ + int mma_tile_idx = mma_tile_out_idx / (kBTilesPerMma / kTilesPerMma); + CUTLASS_PRAGMA_UNROLL + for (int elem_out_idx = 0; elem_out_idx < kNumBsPerCoreTileFragement; elem_out_idx++){ + int elem_idx = elem_out_idx / BlockingShape::kRow; + int idx = elem_idx + mma_tile_idx * kCoreTileFragementSize + n_idx * kCoreTileFragementSize * kTilesPerMma; + ElementScale s = scales[idx]; + if constexpr(kHasOffset){ + offset = s * static_cast(-16 - int(offsets[idx])); + } else { + offset = s * static_cast(-16-8); + } + dest[out_idx] = s * dest[out_idx] + offset; + out_idx++; + } + } + } + + } + + } + + /// Advances the pointer + CUTLASS_HOST_DEVICE + QuantBMetaMmaTensorOpTileIterator &operator++() { + // This is for operand B, so advance on the K dimension + lane_position_ += make_Coord(MetaTile::TileShapeB::kRow, 0); + return *this; + } + + CUTLASS_DEVICE + QuantBMetaMmaTensorOpTileIterator &add_tile_offset( + TensorCoord const &tile_offset) { + int rows = tile_offset.row() * MetaTile::TileShapeB::kRow; + int columns = tile_offset.column() * MetaTile::TileShapeB::kColumn; + lane_position_ += TensorCoord(rows, columns); + return *this; + } + +}; + + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for row major layout + +template < + /// Shape of the operand B matrix to load in a warp (concept: MatrixShape) + typename WarpShapeB_, + /// Block dimensions of the blockwise quantization. 
So the actual meta data + /// warp shape is WarpShapeB_ / BlockingShape_ + typename BlockingShape_, + /// Data type of the meta data elements + typename ElementScale_, + /// Data type of quant offsets + typename ElementOffset_, + /// Underlying matrix multiply operator (concept: arch::Mma) + typename ArchMmaOperator_, + /// Number of threads participating in one matrix operation + int Threads> +class QuantBMetaMmaTensorOpTileIterator{ +public: + + using WarpShapeB = WarpShapeB_; + using BlockingShape = BlockingShape_; + using ElementScale = ElementScale_; + using ElementOffset = ElementOffset_; + using Layout = cutlass::layout::RowMajor; + using ArchMmaOperator = ArchMmaOperator_; + + static constexpr bool kHasOffset = !(std::is_same::value); + + static_assert(BlockingShape::kColumn == 1 && BlockingShape::kRow > 1, + "Only support column blocking for row major layout"); + + using MetaTile = QuantBMetaMmaTile; + + /// Number of MMA instructions for this tile + static constexpr int kMmaIterationsB = MetaTile::kMmaIterationsB; + + /// Number of B elements per mma tile fragment (32b), 2 for half precision, 4 for int8 + static constexpr int kNumBsPerCoreTileFragement = MetaTile::kNumBsPerCoreTileFragement; + + /// Each mma instruction can process either 1 or 2 operand B tiles (stacked on the k dimension) + static constexpr int kBTilesPerMma = MetaTile::kBTilesPerMma; + + /// Number of B elements a fragment of meta data should cover + static constexpr int kExpandedSize = MetaTile::kExpandedSize; + + /// Number of meta elements per core tile fragment + static constexpr int kCoreTileFragementSize = MetaTile::kCoreTileFragementSize; + + /// stride for reaching the next core tile (if there is one) on the K dimension + static constexpr int kKTileStride = MetaTile::kKTileStride; + + /// do we need to load meta data for the next core tile on the K dimension? 
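The dequantization path above relies on a single algebraic rewrite: the 4-bit weights are expanded to fp16 as (w + 16), and the zero point is folded into a per-block addend scale * (-16 - offset), so one f16x2 FMA yields scale * (w - offset). A minimal host-side sketch, using plain float in place of fp16 and illustrative names only, that checks the identity:

```
// Check: scale * (w + 16) + scale * (-16 - z) == scale * (w - z)
// for every 4-bit weight w and 4-bit zero point z.
#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  const float scale = 0.125f;                          // any per-block scale
  for (int w = 0; w < 16; ++w) {                       // quantized 4-bit weight
    for (int z = 0; z < 16; ++z) {                     // 4-bit zero point (offset)
      const float expanded = float(w + 16);            // what weights2Half produces
      const float addend = scale * (-16.0f - z);       // precomputed once per block
      const float fused = scale * expanded + addend;   // one FMA per element
      const float reference = scale * float(w - z);    // textbook dequantization
      assert(std::fabs(fused - reference) < 1e-6f);
    }
  }
  std::printf("rewrite matches plain dequantization for all (w, z) pairs\n");
  return 0;
}
```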
+ static constexpr int kTilesPerMma = MetaTile::kTilesPerMma; + + static constexpr int kNStride = MetaTile::kNStride; + static constexpr int kNRepeats = MetaTile::kNRepeats; + static constexpr int kMmaIterations = MetaTile::kMmaIterations; + + using TensorRefScale = TensorRef; + using TensorRefOffset = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + using StrideIndex = typename Layout::Stride::Index; + + using FragmentScale = Array; + using FragmentOffset = typename std::conditional, + std::monostate>::type; + +private: + + ElementScale *pointer_; + Layout layout_; + + ElementOffset *pointer_offset_; + Layout layout_offset_; + + TensorCoord lane_position_; + +public: + + CUTLASS_DEVICE + QuantBMetaMmaTensorOpTileIterator() { } + + CUTLASS_DEVICE + QuantBMetaMmaTensorOpTileIterator( + TensorRefScale const &ref, + TensorRefOffset const &ref_offset, + int lane_idx + ): + pointer_(ref.data()), + layout_(ref.layout()), + pointer_offset_(ref_offset.data()), + layout_offset_(ref_offset.layout()), + lane_position_(MetaTile::lane_position(lane_idx)) + {} + + /// Loads a fragment + CUTLASS_HOST_DEVICE + void load(FragmentScale &frag, FragmentOffset &frag_offset) { + const int row = lane_position_.row() / BlockingShape::kRow; + const int column = lane_position_.column() / BlockingShape::kColumn; + static_assert(kTilesPerMma * kCoreTileFragementSize == 1, "Only support one meta data per core tile"); + + ElementScale* src_ptr = pointer_ + layout_({row, column}); + ElementScale* dst_ptr = frag.data(); + CUTLASS_PRAGMA_UNROLL + for (int n_idx = 0; n_idx < kMmaIterations; n_idx++){ + dst_ptr[n_idx] = src_ptr[n_idx * kNStride]; + } + + if constexpr(kHasOffset){ + ElementOffset* src_ptr_offset = pointer_offset_ + layout_offset_({row, column}); + ElementOffset* dst_ptr_offset = frag_offset.data(); + CUTLASS_PRAGMA_UNROLL + for (int n_idx = 0; n_idx < kMmaIterations; n_idx++){ + dst_ptr_offset[n_idx] = src_ptr_offset[n_idx * kNStride]; + } + } + } + + template + CUTLASS_HOST_DEVICE + static Array debug_expand(Array const &frag){ + Array ret; + + int out_idx = 0; + CUTLASS_PRAGMA_UNROLL + for (int n_out = 0; n_out < kMmaIterationsB; n_out++){ + int n_idx = n_out / kNRepeats; + CUTLASS_PRAGMA_UNROLL + for (int mma_tile_out_idx = 0; mma_tile_out_idx < kBTilesPerMma; mma_tile_out_idx++){ + int mma_tile_idx = mma_tile_out_idx / (kBTilesPerMma / kTilesPerMma); + CUTLASS_PRAGMA_UNROLL + for (int elem_out_idx = 0; elem_out_idx < kNumBsPerCoreTileFragement; elem_out_idx++){ + int elem_idx = elem_out_idx / BlockingShape::kRow; + int col = elem_idx + mma_tile_idx * kCoreTileFragementSize; + int idx = col * kMmaIterations + n_idx; + ret[out_idx] = frag[idx]; + out_idx++; + } + } + } + return ret; + } + + CUTLASS_HOST_DEVICE + static void dequant(FragmentScale const &scales, + FragmentOffset const &offsets, + Array const &weights, + Array& dest){ + static_assert(kNRepeats == 1, "This is implied by BlockingShape::kColumn == 1"); + static_assert(kNumBsPerCoreTileFragement == 2, "Only for 16b gemm now."); + + // First convert 4b weight into fp16(weight + 16) + weights2Half(weights, dest); + + ElementScale addon[kMmaIterationsB]; + if constexpr (kMmaIterationsB % 4 == 0) { + const b64* scales_ptr = reinterpret_cast(scales.data()); + uint32_t* addon_ptr = reinterpret_cast(addon); + if constexpr(kHasOffset){ + const uint32_t* p = reinterpret_cast(offsets.data()); + CUTLASS_PRAGMA_UNROLL + for (int n_idx = 0; n_idx < 
kMmaIterationsB; n_idx += 4){ +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + asm volatile( + "{\n\t" + " .reg .b32 rb0, rb1, rb2;\n" + + // offset from [d, c, b, a] --> [d, b, c, a] + " prmt.b32 rb2, %4, rb0, 0x3120;\n" + + // static_cast(-16 - offset) + // input [d, b, c, a], + " shl.b32 rb0, rb2, 6;\n" // rb0 = [x, b, x, a] << 6 + " shr.u32 rb1, rb2, 2;\n" // rb1 = [x, d, x, c] << 6 + " lop3.b32 rb0, rb0, 0x03c003c0, 0xcc00cc00, 0xea;\n" // a & 0x03c0 | 0xcc00 + " lop3.b32 rb1, rb1, 0x03c003c0, 0xcc00cc00, 0xea;\n" + " mul.rn.f16x2 %0, %2, rb0;\n" // offset = scale * (-16 - offset) + " mul.rn.f16x2 %1, %3, rb1;\n" + "}\n" + : "=r"(addon_ptr[0]), "=r"(addon_ptr[1]) + : "r"(scales_ptr->pair.a), "r"(scales_ptr->pair.b), + "r"(p[0])); +#else + assert(0); +#endif + scales_ptr++; + p++; + addon_ptr += 2; + } + } else { + CUTLASS_PRAGMA_UNROLL + for (int n_idx = 0; n_idx < kMmaIterationsB; n_idx += 4){ +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + asm volatile( + "{\n\t" + " .reg .b32 rb0;\n" + " mov.u32 rb0, 0xce00ce00;\n" + " mul.rn.f16x2 %0, %2, rb0;\n" // offset = scale * (-16 - 8) + " mul.rn.f16x2 %1, %3, rb0;\n" + "}\n" + : "=r"(addon_ptr[0]), "=r"(addon_ptr[1]) + : "r"(scales_ptr->pair.a), "r"(scales_ptr->pair.b)); +#else + assert(0); +#endif + scales_ptr++; + addon_ptr += 2; + } + } + } else if constexpr (kMmaIterationsB % 2 == 0) { + const uint32_t* scales_ptr = reinterpret_cast(scales.data()); + uint32_t* addon_ptr = reinterpret_cast(addon); + + if constexpr (kHasOffset){ + // possible buffer over read 2 bytes here. + const uint32_t* p = reinterpret_cast(offsets.data()); +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + asm volatile( + "{\n\t" + " .reg .b32 rb0, rb1, rb2;\n" + + // offset from [?, ?, b, a] --> [?, b, ?, a] + " prmt.b32 rb2, %2, rb0, 0x3120;\n" + + // static_cast(-16 - offset) + // input [d, b, c, a], + " shl.b32 rb0, rb2, 6;\n" // rb0 = [x, b, x, a] << 6 + " lop3.b32 rb0, rb0, 0x03c003c0, 0xcc00cc00, 0xea;\n" // a & 0x03c0 | 0xcc00 + " mul.rn.f16x2 %0, %1, rb0;\n" // offset = scale * (-16 - offset) + "}\n" + : "=r"(addon_ptr[0]) + : "r"(scales_ptr[0]) + "r"(p[0])); +#else + assert(0); +#endif + } else { +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + asm volatile( + "{\n\t" + " .reg .b32 rb0;\n" + " mov.u32 rb0, 0xce00ce00;\n" + " mul.rn.f16x2 %0, %1, rb0;\n" // offset = scale * (-16 - 8) + "}\n" + : "=r"(addon_ptr[0]) + : "r"(scales_ptr[0])); +#else + assert(0); +#endif + } + } else { + // kMmaIterationsB == 1 + if constexpr(kHasOffset){ + uint8_t zp = offsets[0]; + addon[0] = scales[0] * static_cast(-16 - static_cast(zp)); + } else { + addon[0] = scales[0] * static_cast(-16-8); + } + } + + int out_idx = 0; + CUTLASS_PRAGMA_UNROLL + for (int n_out = 0; n_out < kMmaIterationsB; n_out++){ + CUTLASS_PRAGMA_UNROLL + for (int mma_tile_out_idx = 0; mma_tile_out_idx < kBTilesPerMma; mma_tile_out_idx++){ + dest[out_idx] = scales[n_out] * dest[out_idx] + addon[n_out]; + dest[out_idx + 1] = scales[n_out] * dest[out_idx + 1] + addon[n_out]; + out_idx += 2; + } + } + } + + /// Advances the pointer + CUTLASS_HOST_DEVICE + QuantBMetaMmaTensorOpTileIterator &operator++() { + // This is for operand B, so advance on the K dimension + lane_position_ += make_Coord(MetaTile::TileShapeB::kRow, 0); + return *this; + } + + CUTLASS_DEVICE + QuantBMetaMmaTensorOpTileIterator &add_tile_offset( + TensorCoord const &tile_offset) { + int rows = tile_offset.row() * MetaTile::TileShapeB::kRow; + int columns = tile_offset.column() * 
MetaTile::TileShapeB::kColumn; + lane_position_ += TensorCoord(rows, columns); + return *this; + } + +}; + + +//////////////////////////////////////////////////////////////////////////////// +} // namespace warp +} // namespace gemm +} // namespace cutlass diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_mma_tensor_op.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_mma_tensor_op.h new file mode 100644 index 0000000000000..f29cedf326a44 --- /dev/null +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_mma_tensor_op.h @@ -0,0 +1,361 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** + * Modifications Copyright (c) Microsoft. + * Licensed under the MIT license. + * + * @file quantb_mma_tensor_op.h + * @brief Modified from cutlass/gemm/warp/mma_tensor_op.h + * Templates implementing warp-level matrix multiply-accumulate operations + * targeting tensor cores. 
+ */ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/platform/platform.h" + +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/arch/mma_sm75.h" +#include "cutlass/arch/mma_sm80.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/warp/mma.h" +#include "cutlass/gemm/warp/mma_tensor_op_policy.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h" + +#include "cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Data type of A elements + typename ElementA_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Data type of B elements + typename ElementB_, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Data type of quant scales + typename ElementQScale_, + /// Layout of quant scales (concept: MatrixLayout) + typename SmemLayoutQScale_, + /// Data type of quant offsets + typename ElementQOffset_, + /// Layout of quant offsets (concept: MatrixLayout) + typename SmemLayoutQOffset_, + /// Blocking dimensions of quantization + typename QuantBlocking_, + /// Element type of C matrix + typename ElementC_, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + typename Policy_, + /// Number of partitions along K dimension + int PartitionsK_ = 1, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. 
+ bool AccumulatorsInRowMajor = false, + /// Used for partial specialization + typename Enable = bool +> +class QuantBMmaTensorOp { +public: + /// Shape of warp-level matrix operation (concept: GemmShape) + using Shape = Shape_; + + /// Data type of multiplicand A + using ElementA = ElementA_; + + /// Layout of multiplicand A + using LayoutA = LayoutA_; + + /// Data type of multiplicand B + using ElementB = ElementB_; + + /// Layout of multiplicand B + using LayoutB = LayoutB_; + + /// Data type of accumulator matrix C + using ElementC = ElementC_; + + /// Layout of accumulator matrix C + using LayoutC = LayoutC_; + + /// Shape of the warp in units of thread (concept: MmaLanePolicySimt) + using Policy = Policy_; + + /// Underlying matrix multiply operator (concept: arch::Mma) + using ArchMmaOperator = typename Policy::Operator; + + /// Indicates math operator + using MathOperator = typename ArchMmaOperator::Operator; + + /// Architecture tag from underlying instruction + using ArchTag = typename ArchMmaOperator::ArchTag; + + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassTensorOp; + + /// Shape of underlying instruction + using InstructionShape = typename ArchMmaOperator::Shape; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = ComplexTransform::kNone; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = ComplexTransform::kNone; + + /// Number of threads participating in warp-level matrix product + static int const kThreadCount = 32; + + /// Number of partitions along K dimension + static int const kPartitionsK = PartitionsK_; + +public: + + /// Iterates over the A operand in memory + using IteratorA = MmaTensorOpMultiplicandTileIterator< + MatrixShape, Operand::kA, ElementA, LayoutA, + MatrixShape, + Policy::OpDelta::kRow, kThreadCount, kPartitionsK>; + + /// Storage for A tile + using FragmentA = typename IteratorA::Fragment; + + /// Storage for transformed A tile + using TransformedFragmentA = + Array; + + /// Iterates over the B operand in memory + using IteratorB = MmaTensorOpMultiplicandTileIterator< + MatrixShape, Operand::kB, ElementB, LayoutB, + MatrixShape, + Policy::OpDelta::kRow, kThreadCount, kPartitionsK>; + // warp B MatrixShape<64, 64>, + // layout B cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise<16, 64>, + // instruction op shape cutlass::MatrixShape<16, 8>, + // kPartitionsK 1 + // FragmentB::kElements 32 + + /// Storage for B tile + using FragmentB = typename IteratorB::Fragment; // cutlass::Array + + /// Storage for transformed B tile + /// When loading weights, we packed 4 int4 weights into one 2-byte-element, when expanded + /// we multiply the number of elements by 4. 
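As the comment above notes, four int4 weights travel in each 2-byte element of FragmentB, so the transformed fragment holds four times as many elements. A small sketch of that unpacking, with an arbitrary nibble order chosen only for illustration (the real order is fixed by the prepacking routine):

```
// Expand one packed 16-bit element into its four 4-bit weights.
#include <array>
#include <cstdint>
#include <cstdio>

std::array<int, 4> unpack4(uint16_t packed) {
  return {(packed >> 0) & 0xF, (packed >> 4) & 0xF,
          (packed >> 8) & 0xF, (packed >> 12) & 0xF};
}

int main() {
  const uint16_t packed = 0x9A31;          // four 4-bit weights in 2 bytes
  for (int w : unpack4(packed)) {
    std::printf("%d ", w);                 // prints 1 3 10 9: fragment expands 4x
  }
  std::printf("\n");
  return 0;
}
```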
+ /// TODO: make sure ArchMmaOperator::ElementB same as dequantized ElementB + /// and change the transform function below to perform dequantization + using TransformedFragmentB = + Array; + + /// Iterates over the C operand in memory + using IteratorC = MmaTensorOpAccumulatorTileIterator< + MatrixShape, ElementC, LayoutC, + typename ArchMmaOperator::Shape, typename Policy::OpDelta>; + + /// Storage for C tile + using FragmentC = typename IteratorC::Fragment; + + using ElementQScale = ElementQScale_; + using SmemLayoutQScale = SmemLayoutQScale_; + using QuantBlocking = QuantBlocking_; + + using ElementQOffset = ElementQOffset_; + using SmemLayoutQOffset = SmemLayoutQOffset_; + + /// Iterates over the quantization parameters in memory + using WarpQScaleShape = MatrixShape<(Shape::kK / QuantBlocking::kRow), (Shape::kN / QuantBlocking::kColumn)>; + static_assert(Shape::kK % QuantBlocking::kRow == 0, "K must be multiple of QuantBlocking::kRow"); + static_assert(Shape::kN % QuantBlocking::kColumn == 0, "N must be multiple of QuantBlocking::kColumn"); + static_assert(WarpQScaleShape::kCount > 0, "QuantBlocking too big to fit in a warp block!"); + + // TODO This is an expanding iterator, it needs to replicate the quantization parameters + // to all threads in the warp. + using IteratorQMeta = QuantBMetaMmaTensorOpTileIterator< + MatrixShape, QuantBlocking, ElementQScale, SmemLayoutQScale, + ElementQOffset, SmemLayoutQOffset, + ArchMmaOperator, kThreadCount, kPartitionsK>; + + using FragmentQScale = typename IteratorQMeta::FragmentScale; + using FragmentQOffset = typename IteratorQMeta::FragmentOffset; + + /// Number of mma operations performed + using MmaIterations = MatrixShape< + (Shape::kM + ArchMmaOperator::Shape::kM - 1) / ArchMmaOperator::Shape::kM, + (Shape::kN + ArchMmaOperator::Shape::kN - 1) / ArchMmaOperator::Shape::kN + >; + +public: + + /// Underlying matrix multiply operator (concept: arch::Mma) + ArchMmaOperator mma; + +public: + + // + // Methods + // + + /// Ctor + CUTLASS_DEVICE + QuantBMmaTensorOp() {} + + /// Performs a warp-level matrix multiply-accumulate operation + CUTLASS_DEVICE + void operator()( + FragmentC &D, + TransformedFragmentA const &A, + TransformedFragmentB const &B, + FragmentC const &C + ) const { + + using MmaOperandA = typename ArchMmaOperator::FragmentA; + using MmaOperandB = typename ArchMmaOperator::FragmentB; + using MmaOperandC = typename ArchMmaOperator::FragmentC; + + D = C; + + MmaOperandA const *ptr_A = reinterpret_cast(&A); + MmaOperandB const *ptr_B = reinterpret_cast(&B); + MmaOperandC *ptr_D = reinterpret_cast(&D); + + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) + // Serpentine visitation order maximizing reuse of Rb + // The visitation order is like + // _ + // | | | | + // | | | | + // |_| |_| + // + // Down Up Down Up + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) { + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < MmaIterations::kRow; ++m) { + + int m_serpentine = ((n % 2) ? 
(MmaIterations::kRow - 1 - m) : m); + + if (AccumulatorsInRowMajor) { // matrix B is reordered + mma( + ptr_D[n + m_serpentine * MmaIterations::kColumn], + ptr_A[m_serpentine], + ptr_B[n], + ptr_D[n + m_serpentine * MmaIterations::kColumn]); + } else { + mma( + ptr_D[m_serpentine + n * MmaIterations::kRow], + ptr_A[m_serpentine], + ptr_B[n], + ptr_D[m_serpentine + n * MmaIterations::kRow]); + } + } + } + #elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + // Serpentine visitation order maximizing reuse of Ra + // The visitation order is like + // _________ + // _________| + // |_________ + // __________| + // + // Right Left Right Left + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < MmaIterations::kRow; ++m) { + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) { + + int n_serpentine = ((m % 2) ? (MmaIterations::kColumn - 1 - n) : n); + + if (AccumulatorsInRowMajor) { // matrix B is reordered + mma( + ptr_D[n_serpentine + m * MmaIterations::kColumn], + ptr_A[m], + ptr_B[n_serpentine], + ptr_D[n_serpentine + m * MmaIterations::kColumn]); + } else { + mma(ptr_D[m + n_serpentine * MmaIterations::kRow], + ptr_A[m], + ptr_B[n_serpentine], + ptr_D[m + n_serpentine * MmaIterations::kRow]); + } + } + } + #else + assert(0); + #endif + } + + /// Transform the mma operands to the required types + CUTLASS_DEVICE + void transform(TransformedFragmentB &dst_B, + FragmentB const &B, + FragmentQScale const &scales, + FragmentQOffset const &offsets) const { + + Array const *ptr_B = + reinterpret_cast const *>(&B); + IteratorQMeta::dequant(scales, offsets, *ptr_B, dst_B); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +//#include "cutlass/gemm/warp/mma_tensor_op_fast_f32.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/core/util/matrix_layout.h b/onnxruntime/core/util/matrix_layout.h index a0405e32034ae..783a29d8a2055 100644 --- a/onnxruntime/core/util/matrix_layout.h +++ b/onnxruntime/core/util/matrix_layout.h @@ -17,7 +17,6 @@ #include #include "core/common/gsl.h" -// TODO!! Already have this in cuda, what about cpu code though? #if defined(_MSC_VER) #define ORT_FORCEINLINE __forceinline #else diff --git a/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h b/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h new file mode 100644 index 0000000000000..6ea8b55505214 --- /dev/null +++ b/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h @@ -0,0 +1,203 @@ +/** + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. + * + * Module Name: + * blkq4_fp16_quant_sm80.h + * + * Abstract: + * Oracle computation for blockwise 4b quantization for fp16 + * gemm kernel specifically for Ampere GPUs. 
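Returning to QuantBMmaTensorOp::operator() a few hunks above: the serpentine index means the column index does not jump when the row advances, so the operand fetched for that column stays live in registers across the switch. A tiny sketch of the visit order it produces, with an illustrative 2 x 4 iteration shape:

```
// Print the (m, n) visit order of the SM80 serpentine loop for a
// hypothetical MmaIterations shape of 2 rows x 4 columns.
#include <cstdio>

int main() {
  const int kRow = 2, kColumn = 4;
  for (int m = 0; m < kRow; ++m) {
    for (int n = 0; n < kColumn; ++n) {
      const int n_serpentine = (m % 2) ? (kColumn - 1 - n) : n;
      std::printf("(%d,%d) ", m, n_serpentine);
    }
  }
  // Prints: (0,0) (0,1) (0,2) (0,3) (1,3) (1,2) (1,1) (1,0)
  std::printf("\n");
  return 0;
}
```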
This is used for + * testing the cuda kernel implementation in + * (test/providers/cuda/test_cases) + * and for testing the cuda op prepack code in (test/optimizer) + */ + +#pragma once + +#include "core/util/matrix_layout.h" +#include "core/common/common.h" + +namespace onnxruntime { +namespace test { + +static inline void sm80_prepack_weights_ref( + int rows, + int columns, + const MatrixRef& tensor_weight, + const MatrixRef& tensor_weight_prepacked) { + ORT_ENFORCE(tensor_weight.shape()[0] == rows / 2 && tensor_weight.shape()[1] == columns, + "Unexpected tensor_weight shape! Expected: (", rows / 2, ", ", columns, "), Got: (", + tensor_weight.shape()[0], ", ", tensor_weight.shape()[1], ")."); + ORT_ENFORCE(tensor_weight_prepacked.shape()[0] == rows && tensor_weight_prepacked.shape()[1] == columns / 2, + "tensor_weight_prepacked shape is not compatible with prepacked weight shape"); + + auto t0_base = make_Position(0, 0); + auto t1_base = make_Position(4, 0); + auto t2_base = make_Position(0, 8); + auto t3_base = make_Position(4, 8); + for (int col_dtile = 0; col_dtile < columns / 16; ++col_dtile) { + for (int row_dtile = 0; row_dtile < rows / 16; ++row_dtile) { + // Packing from a 8x16 tile to a 16x8 tile + auto dtile_base = make_Position(row_dtile * 8, col_dtile * 16); + auto packed_tile_base = make_Position(row_dtile * 16, col_dtile * 8); + for (int col = 0; col < 8; ++col) { + for (int row = 0; row < 4; ++row) { + auto cord = make_Position(row, col); + auto packed_cord = packed_tile_base + make_Position(row * 4, col); // packed tile is 16x8 + uint8_t buf[4]; + buf[0] = tensor_weight.at(dtile_base + t0_base + cord); + buf[1] = tensor_weight.at(dtile_base + t1_base + cord); + buf[2] = tensor_weight.at(dtile_base + t2_base + cord); + buf[3] = tensor_weight.at(dtile_base + t3_base + cord); + + // [0, 1, 2, 3, 4, 5, 6, 7] => [0, 2, 4, 6, 1, 3, 5, 7] so that each pair of adjacent weights + // are in different b16 register at the same positions. This makes it easier to convert to + // fp16x2 format in a b32 register + + tensor_weight_prepacked.at(packed_cord) = (buf[0] & 0x0f) | ((buf[1] & 0x0f) << 4); + tensor_weight_prepacked.at(packed_cord + make_Position(1, 0)) = (buf[2] & 0x0f) | ((buf[3] & 0x0f) << 4); + tensor_weight_prepacked.at(packed_cord + make_Position(2, 0)) = ((buf[0] & 0xf0) >> 4) | (buf[1] & 0xf0); + tensor_weight_prepacked.at(packed_cord + make_Position(3, 0)) = ((buf[2] & 0xf0) >> 4) | (buf[3] & 0xf0); + } + } + } + } +} + +template < + typename ScaleElementT, + typename Layout, + typename QuantBlocking> +inline void sm80_prepack_quant_scales_ref( + int rows, + int columns, + const MatrixRef& tensor_scale, + const MatrixRef& tensor_scale_prepacked) { + ORT_ENFORCE(tensor_scale.shape()[0] == (rows / QuantBlocking::kRow) && tensor_scale.shape()[1] == (columns / QuantBlocking::kColumn), + "Unexpected tensor_scale shape! 
Expected: (", + rows / QuantBlocking::kRow, ", ", columns / QuantBlocking::kColumn, ")"); + ORT_ENFORCE(tensor_scale_prepacked.shape() == tensor_scale.shape()); + + // Only prepacking scale and offset tensors for a often used special case: + // 16b gemm (2 elements per 32b register, operand tile shape 8x8) + // 2 B operand tiles per mma instruction stacked on k dimension + // (1,n) quantization blocking + if constexpr (sizeof(ScaleElementT) != 2 || QuantBlocking::kRow != 1) { + ORT_THROW("sm80_prepack_quant_scales_ref should only be called for row-wise block quantization on 16b float values."); + } + + // In Ampere tensor op, each operand B tile is 8 x 8, in a warp of 32 threads, each thread + // holds a fragment of the tile containing 2 elements in the k dimension. Most often we use + // mma instruction shape of 16x8x16, which means 2 B tiles are stacked in the k dimension, + // as shown below (T stands for thread): + // T0, T4, T8, T12 + // T1, T5, T9, T13 + // T2, T6, T10, T14 + // T3, T7, T11, T15 + // T0, T4, T8, T12 + // T1, T5, T9, T13 + // T2, T6, T10, T14 + // T3, T7, T11, T15 + // + // We need to deliver quantization scale and offset elements to the corresponding threads, + // so we can perform dequantization efficiently. With a column major layout, each thread + // needs two separate loads for a mma instruction, due to the tile fragment layout shown + // above. To reduce the number of loads, we rearrange each column as below, so we can use + // a single load to load fragments for two tiles: + // T0 T0 + // T1 T0 + // T2 T1 + // T3 => T1 + // T0 T2 + // T1 T2 + // T2 T3 + // T3 T3 + + for (int col = 0; col < tensor_scale.shape()[1]; ++col) { + for (int row_blk = 0; row_blk < tensor_scale.shape()[0]; row_blk += 16) { + for (int thread_id = 0; thread_id < 4; thread_id++) { + const int dst_idx = row_blk + thread_id * 4; + const int src_idx = row_blk + thread_id * 2; + tensor_scale_prepacked.at(dst_idx + 0, col) = tensor_scale.at(src_idx + 0, col); + tensor_scale_prepacked.at(dst_idx + 1, col) = tensor_scale.at(src_idx + 1, col); + tensor_scale_prepacked.at(dst_idx + 2, col) = tensor_scale.at(src_idx + 8, col); + tensor_scale_prepacked.at(dst_idx + 3, col) = tensor_scale.at(src_idx + 9, col); + } + } + } +} + +template +inline void sm80_prepack_quant_offsets_ref( + int rows, + int columns, + MatrixRef tensor_offset, + MatrixRef tensor_offset_prepacked) { + const auto meta_shape = make_Position(rows / QuantBlocking::kRow, columns / QuantBlocking::kColumn); + const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]); + ORT_ENFORCE(tensor_offset_prepacked.shape() == meta_shape, + "Unexpected tensor_offset_prepacked shape (", + tensor_offset_prepacked.shape()[0], ",", tensor_offset_prepacked.shape()[1], + ")! Expected: (", meta_shape[0], ", ", meta_shape[1], ")"); + ORT_ENFORCE(tensor_offset.shape() == zp_shape, + "Unexpected tensor_offset shape (", + tensor_offset.shape()[0], ",", tensor_offset.shape()[1], + ")! 
Expected: (", zp_shape[0], ", ", zp_shape[1], ")"); + + // Only prepacking scale and offset tensors for a often used special case: + // 16b gemm (2 elements per 32b register, operand tile shape 8x8) + // 2 B operand tiles per mma instruction stacked on k dimension + // (1,n) quantization blocking + if constexpr (QuantBlocking::kRow != 1) { + ORT_THROW("sm80_prepack_quant_offsets_ref should only be called for row-wise block quantization."); + } + // In Ampere tensor op, each operand B tile is 8 x 8, in a warp of 32 threads, each thread + // holds a fragment of the tile containing 2 elements in the k dimension. Most often we use + // mma instruction shape of 16x8x16, which means 2 B tiles are stacked in the k dimension, + // as shown below (T stands for thread): + // T0, T4, T8, T12 + // T1, T5, T9, T13 + // T2, T6, T10, T14 + // T3, T7, T11, T15 + // T0, T4, T8, T12 + // T1, T5, T9, T13 + // T2, T6, T10, T14 + // T3, T7, T11, T15 + // + // We need to deliver quantization scale and offset elements to the corresponding threads, + // so we can perform dequantization efficiently. With a column major layout, each thread + // needs two separate loads for a mma instruction, due to the tile fragment layout shown + // above. To reduce the number of loads, we rearrange each column as below, so we can use + // a single load to load fragments for two tiles: + // T0 T0 + // T1 T0 + // T2 T1 + // T3 => T1 + // T0 T2 + // T1 T2 + // T2 T3 + // T3 T3 + if (tensor_offset_prepacked.good()) { + for (int col = 0; col < tensor_offset_prepacked.shape()[1]; ++col) { + for (int row_blk = 0; row_blk < tensor_offset_prepacked.shape()[0]; row_blk += 16) { + for (int thread_id = 0; thread_id < 4; thread_id++) { + const int dst_idx = row_blk + thread_id * 4; + const int src_idx = row_blk + thread_id * 2; + // [a, b, c, d] => [a, c, b, d] so that adjacent weights are in their own + // 16b element: [a, x, b, x] and [x, c, x, d], which makes it easier to + // convert to fp16x2 format in a b32 register + uint8_t pair01 = tensor_offset.at(src_idx / 2, col); + uint8_t pair89 = tensor_offset.at((src_idx + 8) / 2, col); + tensor_offset_prepacked.at(dst_idx + 0, col) = pair01 & 0xf; + tensor_offset_prepacked.at(dst_idx + 1, col) = pair89 & 0xf; + tensor_offset_prepacked.at(dst_idx + 2, col) = pair01 >> 4; + tensor_offset_prepacked.at(dst_idx + 3, col) = pair89 >> 4; + } + } + } + } +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h new file mode 100644 index 0000000000000..bbe370675fc48 --- /dev/null +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h @@ -0,0 +1,188 @@ +/** + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. + * + * Module Name: + * blkq4_fp16_gemm_sm80.h + * + * Abstract: + * Bridge between gtest code and gemm kernel implementation. + * Gemm kernel requires CUTLASS header files, which causes strange + * compilation errors with RE2 header files, which are required + * by gtest. 
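To make the rearrangement performed by the two prepack reference routines above concrete: within every 16-row block, each thread's two fragments (one per stacked B tile) land in four consecutive destination rows. A sketch of that index mapping, mirroring dst_idx = thread_id * 4 and sources src_idx, src_idx + 1, src_idx + 8, src_idx + 9:

```
// Source row feeding each prepacked destination row in one 16-row block.
#include <cstdio>

int main() {
  int src_of_dst[16];
  for (int thread_id = 0; thread_id < 4; ++thread_id) {
    const int dst_idx = thread_id * 4;      // row_blk omitted (single block)
    const int src_idx = thread_id * 2;
    src_of_dst[dst_idx + 0] = src_idx + 0;  // fragment from the first B tile
    src_of_dst[dst_idx + 1] = src_idx + 1;
    src_of_dst[dst_idx + 2] = src_idx + 8;  // fragment from the second B tile
    src_of_dst[dst_idx + 3] = src_idx + 9;
  }
  for (int dst = 0; dst < 16; ++dst) {
    std::printf("dst %2d <- src %2d\n", dst, src_of_dst[dst]);
  }
  return 0;
}
```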
+ */ + +#pragma once + +#include + +#include "core/util/matrix_layout.h" +#include "core/common/common.h" +#include "core/mickey/blk_q4/f16_prepack_sm80.h" +#include "test/cuda_host/blkq4_fp16_quant_sm80.h" + +namespace onnxruntime { +namespace cuda { +namespace test { + +Status sm80_supported(); + +/** + * @brief Generate a set of quantized weights, scales and offsets + * and dequantized weights for testing quantization and + * dequantization. All outputs are column major layout. + * + * @tparam ElementT The type of the dequantized weights. + * @tparam block_size The block size of the quantization. + * @tparam col_blocking Whether to use column blocking (all elements of + * a block comes from a single column) or row blocking + * @tparam has_offsets Whether to generate offsets. + * + * @param[in] rows The number of rows of the weight matrix. + * @param[in] columns The number of columns of the weight matrix. + * @param[out] dequants The dequantized weights, column major layout. + * @param[out] q_weights The quantized weights, column major layout. + * @param[out] q_scales The scales, column major layout. + * @param[out] q_zp The zero points, column major layout. + */ +template +inline void blkq4_weights_gen( + int rows, int columns, + std::vector& dequants, + std::vector& q_weights, + std::vector& q_scales, + std::vector& q_zp) { + using Base = onnxruntime::cuda::BlockwiseQuantization< + ElementT, + block_size, + 4, + col_blocking>; + + using QuantBlocking = typename Base::QuantBlocking; + using ElementW = typename Base::ElementW; + using LayoutWPack = typename Base::LayoutWPack; + using ElementQOffset = typename Base::ElementQOffset; + + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + + unsigned int seed = 28571; // Replace with desired seed value + std::seed_seq seq{seed}; + std::mt19937 gen(seq); + std::uniform_int_distribution dis(0, 8192); + + const auto q_weight_shape = Base::get_quant_weights_shape(rows, columns); + const auto meta_shape = Base::get_quant_meta_shape(rows, columns); + const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]); + + // + // For testing quantization and dequantization, it is not straight + // forward to avoid flaky tests due to rounding errors. The way we + // try to achieve this is to: + // 1. Generate a set of quantized weights, scales and offsets + // 2. Dequantize the weights + // 3. Quantize the dequantized weights + // 4. Compare the dequantied-and-then-quantized weights with + // the original quantized weights + // + // Random filling of the initial values are key to get this right. + // For weights, we must ensure each block gets a full range of + // values, i.e. must contain 0 and 15. And for scales, they must + // all be positive. 
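The round-trip strategy described in the comment above (generate codes, dequantize, re-quantize, compare) avoids flaky comparisons because the generated scales are exactly representable, of the form m / 2^(2+e). A scalar sketch of that round trip for a single block, with an arbitrary scale and zero point chosen only for illustration:

```
// Round trip: 4-bit code -> dequantize -> re-quantize -> same code.
#include <cassert>
#include <cmath>

int main() {
  const float scale = 3.0f / 16.0f;  // exactly representable, like m / 2^(2+e)
  const int zero_point = 5;
  for (int q = 0; q < 16; ++q) {
    const float dequant = scale * float(q - zero_point);            // step 2
    int requant = int(std::lround(dequant / scale)) + zero_point;   // step 3
    requant = requant < 0 ? 0 : (requant > 15 ? 15 : requant);
    assert(requant == q);                                           // step 4
  }
  return 0;
}
```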
+ // + + q_weights.resize(q_weight_shape.product()); + MatrixRef tensor_q_weight( + q_weights, make_Position(rows / 2, columns)); + int v = 7; + for (int c = 0; c < tensor_q_weight.shape()[1]; c++) { + for (int r = 0; r < tensor_q_weight.shape()[0]; ++r) { + uint8_t v0 = static_cast(v); + v = (v + 5) % 16; + if (v == 11 || v == 7 || v == 3) { + // making the cycle 13 instead of 16, avoiding same values in a row + v = (v + 5) % 16; + } + uint8_t v1 = 0; + if (r + 1 < rows) { + v1 = static_cast(v); + v = (v + 5) % 16; + if (v == 11 || v == 7 || v == 3) { + // making the cycle 13 instead of 16, avoiding same values in a row + v = (v + 5) % 16; + } + } + + tensor_q_weight.at(r, c) = ElementW((v1 << 4) | v0); + } + } + + q_scales.resize(meta_shape.product()); + for (size_t i = 0; i < q_scales.size(); i++) { + uint32_t v = dis(gen); + uint32_t m = (v % 63) + 1; + uint32_t e = (v >> 6) % 4; + q_scales[i] = ElementT(m / static_cast(1 << (2 + e))); + } + MatrixRef tensor_scale( + q_scales, meta_shape); + + MatrixRef tensor_offset; + if constexpr (has_offsets) { + q_zp.resize(zp_shape.product()); + tensor_offset = MatrixRef( + q_zp, zp_shape); + for (int c = 0; c < zp_shape[1]; c++) { + for (int r = 0; r < zp_shape[0]; ++r) { + uint8_t v0 = dis(gen) % 16; + uint8_t v1 = 8; + if (r * 2 + 1 < meta_shape[0]) { + v1 = dis(gen) % 16; + } + tensor_offset.at(r, c) = static_cast(v0 | (v1 << 4)); + } + } + } + + dequants.resize(rows * columns); + MatrixRef tensor_dequant(dequants, make_Position(rows, columns)); + + // Dequantize weights and save into matrix B + for (int col = 0; col < tensor_dequant.shape()[1]; ++col) { + for (int row = 0; row < tensor_dequant.shape()[0]; ++row) { + auto weight_cord = make_Position(row / 2, col); + auto scale_cord = make_Position(row / QuantBlocking::kRow, col / QuantBlocking::kColumn); + uint8_t offset = 8; + if constexpr (has_offsets) { + if (scale_cord[0] % 2 == 0) { + offset = tensor_offset.at(scale_cord[0] / 2, scale_cord[1]) & 0x0f; + } else { + offset = tensor_offset.at(scale_cord[0] / 2, scale_cord[1]) >> 4; + } + } + int w = 0; + if (row % 2 == 0) { + w = int(tensor_q_weight.at(weight_cord) & 0x0f); + } else { + w = int(tensor_q_weight.at(weight_cord) >> 4); + } + float scale = float(tensor_scale.at(scale_cord)); + float dequant = scale * float(w - offset); + tensor_dequant.at(row, col) = ElementT(dequant); + // Prints for help debugging in case of test failure + // fprintf(stderr, "(%2d,%2d)= %2d, %2d, %f, %f\n", row, col, w, offset, scale, dequant); + } + } +} + +template < + int block_size, + bool column_wise_blocking, + bool small_m, + bool has_offsets> +void run_blkq4_gemm(int m, int n, int k); + +} // namespace test +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc new file mode 100644 index 0000000000000..e687ae73e66f2 --- /dev/null +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc @@ -0,0 +1,330 @@ +/** + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. + * + * Module Name: + * blkq4_fp16_gemm_sm80_test.cc + * + * Abstract: + * Test code for block-wise quantized 4b GEMM kernels. + * This part requires gtest header files, which do not play + * well with CUTLASS headers. 
+ */ + +#include + +#include "core/framework/float16.h" +#include "core/mlas/inc/mlas_q4.h" + +#include "blkq4_fp16_gemm_sm80.h" + +#include "gtest/gtest.h" + +namespace onnxruntime { +namespace test { + +template +void testPrepack(int rows, int columns) { + using ElementT = MLFloat16; + constexpr int block_size = 32; + using Base = onnxruntime::cuda::BlockwiseQuantization< + ElementT, + block_size, + 4, + col_blocking>; + + using QuantBlocking = typename Base::QuantBlocking; + using ElementW = typename Base::ElementW; + using LayoutWPack = typename Base::LayoutWPack; + using ElementQOffset = typename Base::ElementQOffset; + using LayoutQmeta = typename Base::LayoutQmeta; + + const auto q_weight_shape = Base::get_quant_weights_shape(rows, columns); + const auto meta_shape = Base::get_quant_meta_shape(rows, columns); + const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]); + + std::vector q_weights; + std::vector q_scales; + std::vector q_zp; + std::vector dequants; + onnxruntime::cuda::test::blkq4_weights_gen( + rows, columns, dequants, q_weights, q_scales, q_zp); + + // for quantization tool, the input is row major, all outputs are column major + MatrixRef tensor_q_weight( + q_weights, make_Position(rows / 2, columns)); + MatrixRef tensor_scale( + q_scales, meta_shape); + MatrixRef tensor_offset; + if constexpr (has_offset) { + tensor_offset = MatrixRef(q_zp, zp_shape); + } + + // for quantization tool, the input is row major, test weight gen output is column major + std::vector dequants_transposed(dequants.size()); + MatrixRef tensor_dequant(dequants, make_Position(rows, columns)); + MatrixRef tensor_dequant_transposed(dequants_transposed, make_Position(rows, columns)); + for (int col = 0; col < tensor_dequant.shape()[1]; ++col) { + for (int row = 0; row < tensor_dequant.shape()[0]; ++row) { + tensor_dequant_transposed.at(row, col) = tensor_dequant.at(row, col); + } + } + + int q_rows, q_cols; + MlasBlockwiseQuantizedShape( + block_size, col_blocking, rows, columns, q_rows, q_cols); + // to be exact, q_rows are padded to multiple of block_size, deal with it when we care about strange shapes + EXPECT_EQ(q_rows, q_weight_shape[0]); + EXPECT_EQ(q_cols, q_weight_shape[1]); + + // + // Quantization tool outputs: + // + std::vector o_elements(q_rows * q_cols); + MatrixRef tensor_o_elements(o_elements, q_weight_shape); + + std::vector o_scales(meta_shape.product()); + MatrixRef tensor_o_scales(o_scales, meta_shape); + + std::vector o_zp(zp_shape.product()); + MatrixRef tensor_o_zp(o_zp, zp_shape); + + MlasQuantizeBlockwise(o_elements.data(), o_scales.data(), has_offset ? o_zp.data() : nullptr, + dequants_transposed.data(), block_size, + col_blocking, rows, columns, columns, nullptr); + for (int col = 0; col < tensor_q_weight.shape()[1]; ++col) { + for (int row = 0; row < tensor_q_weight.shape()[0]; ++row) { + EXPECT_EQ(tensor_o_elements.at(row, col), tensor_q_weight.at(row, col)) + << "quantized value mismatch at [" << row << "," << col << "]" + << " shape[" << rows << "," << columns << "]" + << (col_blocking ? 
"Column-wise-block" : "Row-wise-block") + << std::endl; + } + } + + for (int col = 0; col < meta_shape[1]; ++col) { + for (int row = 0; row < meta_shape[0]; row += 2) { + if (has_offset) { + uint8_t pair01 = tensor_o_zp.at(row / 2, col); + uint8_t expected_pair01 = tensor_offset.at(row / 2, col); + EXPECT_EQ(expected_pair01 & 0xf, pair01 & 0xf) + << "quantized offset mismatch at [" << row << "," << col << "]" + << " shape[" << rows << "," << columns << "]" + << (col_blocking ? "Column-wise-block" : "Row-wise-block") + << std::endl; + if (row + 1 < meta_shape[0]) { + EXPECT_EQ(expected_pair01 >> 4, pair01 >> 4) + << "quantized offset mismatch at [" << row + 1 << "," << col << "]" + << " shape[" << rows << "," << columns << "]" + << (col_blocking ? "Column-wise-block" : "Row-wise-block") + << std::endl; + } + } + + EXPECT_EQ(tensor_scale.at(row + 0, col), tensor_o_scales.at(row + 0, col)) + << "quantized scale mismatch at [" << row << "," << col << "]" + << " shape[" << rows << "," << columns << "]" + << (col_blocking ? "Column-wise-block" : "Row-wise-block") + << std::endl; + if (row + 1 < meta_shape[0]) { + EXPECT_EQ(tensor_scale.at(row + 1, col), tensor_o_scales.at(row + 1, col)) + << "quantized scale mismatch at [" << row + 1 << "," << col << "]" + << " shape[" << rows << "," << columns << "]" + << (col_blocking ? "Column-wise-block" : "Row-wise-block") + << std::endl; + } + } + } + + // + // Now we just setup quantized weights tensor_q_weight, quantization scale tensor_scale + // and quantization offset tensor_offset. The above tests just make sure our setup is + // consistent with quantization tool output. + // + // Next we test the prepack code + // + + std::vector packed_w_ref(q_weight_shape.product()); + MatrixRef tensor_packed_w_ref( + packed_w_ref, make_Position(rows, columns / 2)); + onnxruntime::test::sm80_prepack_weights_ref(rows, columns, tensor_q_weight, tensor_packed_w_ref); + + std::vector packed_w(q_weight_shape.product()); + MatrixRef tensor_packed_w( + packed_w, make_Position(rows, columns / 2)); + Base::prepack_weights(rows, columns, o_elements, packed_w); + + for (int col = 0; col < tensor_packed_w.shape()[1]; ++col) { + for (int row = 0; row < tensor_packed_w.shape()[0]; ++row) { + EXPECT_EQ(tensor_packed_w_ref.at(row, col), tensor_packed_w.at(row, col)) + << "prepacked weights mismatch at [" << row << "," << col << "]" + << " shape[" << rows << "," << columns << "]" + << (col_blocking ? 
"Column-wise-block" : "Row-wise-block") + << std::endl; + } + } + + std::vector packed_scales_ref(meta_shape.product()); + MatrixRef tensor_packed_s_ref = + make_MatrixRef(packed_scales_ref, meta_shape); + if constexpr (Base::ShouldRearrangeMeta) { + onnxruntime::test::sm80_prepack_quant_scales_ref( + rows, columns, tensor_scale.const_ref(), tensor_packed_s_ref); + } else { + for (int col = 0; col < tensor_packed_s_ref.shape()[1]; ++col) { + for (int row = 0; row < tensor_packed_s_ref.shape()[0]; ++row) { + tensor_packed_s_ref.at(row, col) = tensor_scale.at(row, col); + } + } + } + + std::vector packed_scales(meta_shape.product()); + MatrixRef tensor_packed_s( + packed_scales, meta_shape); + Base::prepack_quant_scales(rows, columns, o_scales, packed_scales); + + for (int col = 0; col < tensor_packed_s.shape()[1]; ++col) { + for (int row = 0; row < tensor_packed_s.shape()[0]; ++row) { + EXPECT_EQ(tensor_packed_s_ref.at(row, col), tensor_packed_s.at(row, col)) + << "prepacked scales mismatch at [" << row << "," << col << "]" + << " shape[" << rows << "," << columns << "]" + << (col_blocking ? "Column-wise-block" : "Row-wise-block") + << std::endl; + } + } + + if (has_offset) { + std::vector packed_zp_ref(meta_shape.product()); + MatrixRef tensor_packed_zp_ref = + make_MatrixRef(packed_zp_ref, meta_shape); + if constexpr (Base::ShouldRearrangeMeta) { + onnxruntime::test::sm80_prepack_quant_offsets_ref( + rows, columns, tensor_offset.const_ref(), tensor_packed_zp_ref); + } else { + for (int col = 0; col < meta_shape[1]; ++col) { + for (int row = 0; row < meta_shape[0]; row += 2) { + uint8_t pair01 = tensor_offset.at(row / 2, col); + tensor_packed_zp_ref.at(row, col) = pair01 & 0xf; + if (row + 1 < meta_shape[0]) { + tensor_packed_zp_ref.at(row + 1, col) = pair01 >> 4; + } + } + } + } + + std::vector packed_zp(meta_shape.product()); + MatrixRef tensor_packed_zp( + packed_zp, meta_shape); + Base::prepack_quant_offsets(rows, columns, o_zp, packed_zp); + + for (int col = 0; col < tensor_packed_zp.shape()[1]; ++col) { + for (int row = 0; row < tensor_packed_zp.shape()[0]; ++row) { + EXPECT_EQ(tensor_packed_zp_ref.at(row, col), tensor_packed_zp.at(row, col)) + << "prepacked offsets mismatch at [" << row << "," << col << "]" + << " shape[" << rows << "," << columns << "]" + << (col_blocking ? 
"Column-wise-block" : "Row-wise-block") + << std::endl; + } + } + } +} + +// TODO: code runs on CPU, but this is for sm80 only, maybe enable only when test on sm80 +TEST(BlkQ4_GEMM, PrepackSm80Test) { + Status status = onnxruntime::cuda::test::sm80_supported(); + if (!status.IsOK()) { + // skip the test if sm80 is not supported + return; + } + + testPrepack(32, 32); + testPrepack(32, 32); + testPrepack(32, 32); + testPrepack(32, 32); + testPrepack(32, 64); + testPrepack(32, 128); + testPrepack(32, 256); + testPrepack(64, 32); + testPrepack(128, 32); + testPrepack(256, 32); + testPrepack(256, 256); + testPrepack(32, 128); + testPrepack(128, 32); + testPrepack(256, 256); + testPrepack(32, 64); + testPrepack(32, 128); + testPrepack(32, 256); + testPrepack(64, 32); + testPrepack(128, 32); + testPrepack(256, 32); + testPrepack(256, 256); + testPrepack(32, 128); + testPrepack(128, 32); + testPrepack(256, 256); +} + +TEST(BlkQ4_GEMM, Sm80RowBlockingTest) { + Status status = onnxruntime::cuda::test::sm80_supported(); + if (!status.IsOK()) { + // skip the test if sm80 is not supported + return; + } + + onnxruntime::cuda::test::run_blkq4_gemm<32, false, false, false>(32, 32, 64); + onnxruntime::cuda::test::run_blkq4_gemm<32, false, false, true>(32, 32, 64); + + onnxruntime::cuda::test::run_blkq4_gemm<32, false, false, false>(32, 96, 64); + onnxruntime::cuda::test::run_blkq4_gemm<32, false, false, true>(32, 96, 64); + + onnxruntime::cuda::test::run_blkq4_gemm<32, false, false, false>(32, 96, 192); + onnxruntime::cuda::test::run_blkq4_gemm<32, false, false, true>(32, 96, 192); + + onnxruntime::cuda::test::run_blkq4_gemm<32, false, false, false>(256, 672, 576); + onnxruntime::cuda::test::run_blkq4_gemm<32, false, false, true>(256, 672, 576); + + onnxruntime::cuda::test::run_blkq4_gemm<32, false, false, false>(512, 2048 + 32, 960); + onnxruntime::cuda::test::run_blkq4_gemm<32, false, false, false>(512, 2048 + 32, 960); + + onnxruntime::cuda::test::run_blkq4_gemm<16, false, false, false>(256, 672, 576); + onnxruntime::cuda::test::run_blkq4_gemm<16, false, false, true>(256, 672, 576); + + onnxruntime::cuda::test::run_blkq4_gemm<64, false, false, false>(256, 1024, 576); + onnxruntime::cuda::test::run_blkq4_gemm<64, false, false, true>(256, 1024, 576); +} + +TEST(BlkQ4_GEMM, Sm80ColBlockingTest) { + Status status = onnxruntime::cuda::test::sm80_supported(); + if (!status.IsOK()) { + // skip the test if sm80 is not supported + return; + } + onnxruntime::cuda::test::run_blkq4_gemm<16, true, false, false>(64, 672, 576); + onnxruntime::cuda::test::run_blkq4_gemm<16, true, false, true>(64, 672, 576); + + onnxruntime::cuda::test::run_blkq4_gemm<64, true, false, false>(256, 1024, 576); + onnxruntime::cuda::test::run_blkq4_gemm<64, true, false, true>(256, 1024, 576); +} + +TEST(BlkQ4_GEMM, Sm80SmallMTest) { + Status status = onnxruntime::cuda::test::sm80_supported(); + if (!status.IsOK()) { + // skip the test if sm80 is not supported + return; + } + + // // small m + onnxruntime::cuda::test::run_blkq4_gemm<16, false, true, false>(16, 704, 576); + onnxruntime::cuda::test::run_blkq4_gemm<16, false, true, true>(16, 704, 576); + + onnxruntime::cuda::test::run_blkq4_gemm<64, false, true, false>(16, 1024, 576); + onnxruntime::cuda::test::run_blkq4_gemm<64, false, true, true>(16, 1024, 576); + + onnxruntime::cuda::test::run_blkq4_gemm<16, true, true, false>(16, 672, 576); + onnxruntime::cuda::test::run_blkq4_gemm<16, true, true, true>(16, 672, 576); + + onnxruntime::cuda::test::run_blkq4_gemm<64, true, true, false>(16, 
1024, 576); + onnxruntime::cuda::test::run_blkq4_gemm<64, true, true, true>(16, 1024, 576); +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu new file mode 100644 index 0000000000000..69c929d446ce4 --- /dev/null +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu @@ -0,0 +1,344 @@ +/** + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. + * + * Module Name: + * blkq4_fp16_gemm_sm80_testcu.cu + * + * Abstract: + * Test code for invoking block-wise quantized 4b GEMM kernels. + * This part requires CUTLASS header files, which do not play + * well with gtest headers. + */ + +#include +#include +#include + +#include "core/mickey/blk_q4/f16_gemm_sm80.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "core/common/common.h" + +#include "blkq4_fp16_gemm_sm80.h" + +namespace onnxruntime { +namespace cuda{ +namespace test{ + +Status sm80_supported(){ + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::ostringstream ss; + ss << "Unable to obtain GPU device properties: " << cudaGetErrorString(error); + return Status(common::ONNXRUNTIME, common::ENGINE_ERROR, ss.str()); + } + + if (!((props.major * 10 + props.minor) >= 80)) { + std::ostringstream ss; + ss << "Device compute capability mismatch, desired 8.0, actual " << props.major << "." << props.minor; + return Status(common::ONNXRUNTIME, common::ENGINE_ERROR, ss.str()); + } + return Status::OK(); +} + +/** + * @brief Reference implementation of GEMM + * Copied directly from cutlass util/reference/device/gemm.h + * for the strange reason that compiler insists on asking + * for explicit stream argument in kernel launch. +*/ +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename AccumulatorType +> +void compute_gemm_ref( + cutlass::gemm::GemmCoord problem_size, + ScalarType alpha, + cutlass::TensorRef tensor_a, + cutlass::TensorRef tensor_b, + ScalarType beta, + cutlass::TensorRef tensor_c, + cutlass::TensorRef tensor_d, + AccumulatorType initial_accum = AccumulatorType(0)) { + + // Blocking structure potentially improves performance of reference implementation + // with a minor increase in complexity. + // + // Note, this reference implementation is NOT expected to approach peak performance. 
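The device reference launched below computes D = alpha * A * B + beta * C. For clarity, the same contract written as a plain host-side loop; the layouts here (row-major A, C and D, column-major B, to match the column-major dequantized weights) are assumptions made for the sketch, not taken from the GemmRunner typedefs:

```
// Naive host GEMM with the same contract: D = alpha * A * B + beta * C.
#include <vector>

void reference_gemm(int m, int n, int k, float alpha, float beta,
                    const std::vector<float>& A,  // row-major, m x k
                    const std::vector<float>& B,  // column-major, k x n
                    const std::vector<float>& C,  // row-major, m x n
                    std::vector<float>& D) {      // row-major, m x n
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.0f;
      for (int p = 0; p < k; ++p) {
        acc += A[i * k + p] * B[j * k + p];  // column j of B is contiguous
      }
      D[i * n + j] = alpha * acc + beta * C[i * n + j];
    }
  }
}
```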
+ using OutputTile = cutlass::MatrixShape<4, 4>; + + dim3 block(16, 8); + + dim3 grid( + (problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow), + (problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn) + ); + + // Launch a GEMM kernel + cutlass::reference::device::kernel::Gemm< + cutlass::TensorRef, + cutlass::TensorRef, + cutlass::TensorRef, + ScalarType, + AccumulatorType, + OutputTile, + cutlass::multiply_add, + cutlass::NumericConverter + ><<>>( + problem_size, + alpha, + tensor_a, + tensor_b, + beta, + tensor_c, + tensor_d, + initial_accum + ); +} +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// +// Converting cutlass tensor to MatrixRef +// + +template < + typename Element, + typename LayoutCutlass, + typename Layout = std::conditional_t::value, ColumnMajorLayout, RowMajorLayout> + > +__forceinline__ +MatrixRef make_MatrixRef(cutlass::HostTensor const& tensor) { + static_assert(std::is_same::value + || std::is_same::value); + auto shape = make_Position(tensor.extent().row(), tensor.extent().column()); + auto* ptr = const_cast::type *>(tensor.host_data()); + return MatrixRef(ptr, tensor.capacity(), shape); +} + +template < + typename Element, + typename LayoutCutlass, + typename Layout = std::conditional_t::value, ColumnMajorLayout, RowMajorLayout> + > +__forceinline__ +MatrixRef make_ConstMatrixRef(cutlass::HostTensor const& tensor) { + static_assert(std::is_same::value + || std::is_same::value); + auto shape = make_Position(tensor.extent().row(), tensor.extent().column()); + return MatrixRef(tensor.host_data(), tensor.capacity(), shape); +} + +// +// Invoking the kernel +// + +template< + int block_size, + bool column_wise_blocking, + bool small_m, + bool has_offsets> +void run_blkq4_gemm(int m, int n, int k) { + unsigned int seed = 28571; // Replace with desired seed value + std::seed_seq seq{seed}; + std::mt19937 gen(seq); + std::uniform_int_distribution<> dis(0, 8192); + + using ElementDequant = cutlass::half_t; + using QuantBlocking = + typename std::conditional, + cutlass::MatrixShape<1, block_size>>::type; + + using GemmRunner = BlkQ4F16GemmImpl; + + using ElementAccumulator = typename GemmRunner::ElementAccumulator; + using ElementComputeEpilogue = typename GemmRunner::ElementComputeEpilogue; + using ElementInputA = typename GemmRunner::ElementInputA; + using ElementOutput = typename GemmRunner::ElementOutput; + using ElementW = typename GemmRunner::ElementW; + using ElementWPack = typename GemmRunner::ElementWPack; + using ElementQScale = typename GemmRunner::ElementQScale; + using ElementQOffset = typename GemmRunner::ElementQOffset; + + using LayoutInputA = typename GemmRunner::LayoutInputA; + using LayoutOutput = typename GemmRunner::LayoutOutput; + using LayoutInputWPack = typename GemmRunner::LayoutInputWPack; + using LayoutInputQScale = typename GemmRunner::LayoutInputQScale; + + const cutlass::gemm::GemmCoord problem_size = {m, n, k}; + const auto q_weight_shape = cutlass::make_Coord(problem_size.k()/2, problem_size.n()); + const auto meta_shape = cutlass::make_Coord(problem_size.k()/QuantBlocking::kRow, problem_size.n()/QuantBlocking::kColumn); + + // + // Generate quantized and dequantizeed input matrix B [K, N] + // + static_assert(std::is_same::value); + std::vector q_weights; + std::vector q_scales; + std::vector q_zp; + std::vector dequants; + onnxruntime::cuda::test::blkq4_weights_gen( + problem_size.k(), problem_size.n(), dequants, 
q_weights, q_scales, q_zp); + + using PrepackT = onnxruntime::cuda::BlockwiseQuantization< + ElementDequant, + block_size, + 4, + column_wise_blocking>; + + std::vector packed_w(q_weight_shape.product()); + PrepackT::prepack_weights(problem_size.k(), problem_size.n(), q_weights, packed_w); + std::vector packed_scales(meta_shape.product()); + PrepackT::prepack_quant_scales(problem_size.k(), problem_size.n(), q_scales, packed_scales); + std::vector packed_zp; + if constexpr (has_offsets) { + packed_zp.resize(meta_shape.product()); + PrepackT::prepack_quant_offsets(problem_size.k(), problem_size.n(), q_zp, packed_zp); + } + + cutlass::HostTensor tensor_a( + problem_size.mk()); // <- Create matrix A with dimensions M x K + cutlass::HostTensor tensor_c( + problem_size.mn()); // <- Create matrix C with dimensions M x N + cutlass::HostTensor tensor_d( + problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from + // CUTLASS kernel + + // Fill input and output matrices on host using CUTLASS helper functions + cutlass::reference::host::TensorFillRandomUniform( + tensor_a.host_view(), + 1, + ElementInputA(4), + ElementInputA(-4), + 2); // <- Fill matrix A on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_c.host_view(), + 1, + ElementOutput(4), + ElementOutput(-4), + 0); // <- Fill matrix C on host with uniform-distribution random data + cutlass::reference::host::TensorFill( + tensor_d.host_view()); // <- fill matrix D on host with zeros + + // + // Copy data from host to GPU... + // + thrust::device_vector d_packed_w(packed_w); + cutlass::TensorRef ref_W( + reinterpret_cast(d_packed_w.data().get()), + LayoutInputWPack::packed({problem_size.k()/2, problem_size.n()/2})); + + thrust::device_vector d_packed_scales(packed_scales); + cutlass::TensorRef ref_scales( + d_packed_scales.data().get(), LayoutInputQScale::packed(meta_shape)); + + thrust::device_vector d_packed_zp(packed_zp); + cutlass::TensorRef ref_zp( + d_packed_zp.data().get(), LayoutInputQScale::packed(meta_shape)); + + tensor_a.sync_device(); + tensor_c.sync_device(); + tensor_d.sync_device(); + + // run GEMM + cutlass::Status status; + if constexpr (has_offsets){ + status = GemmRunner::run( + nullptr, problem_size, tensor_a.device_ref(), ref_W, + ref_scales, ref_zp, + tensor_c.device_ref(), tensor_d.device_ref()); + } else { + status = GemmRunner::run( + nullptr, problem_size, tensor_a.device_ref(), ref_W, + ref_scales, + tensor_c.device_ref(), tensor_d.device_ref()); + } + ORT_ENFORCE(status == cutlass::Status::kSuccess, "Kernel execution failed: ", cutlassGetStatusString(status)); + + // Running reference kernel + using ElementInputB = ElementInputA; + using LayoutInputB = cutlass::layout::ColumnMajor; + thrust::device_vector d_dequants(dequants); + cutlass::TensorRef ref_B( + d_dequants.data().get(), LayoutInputB::packed(problem_size.kn())); + cutlass::HostTensor tensor_ref_d( + problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from + // reference kernel + + cutlass::reference::host::TensorFill( + tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros + tensor_ref_d.sync_device(); + + // Initialize alpha and beta for dot product computation + ElementComputeEpilogue alpha = ElementComputeEpilogue(1); + ElementComputeEpilogue beta = ElementComputeEpilogue(0); + + compute_gemm_ref( + problem_size, + alpha, + tensor_a.device_ref(), + ref_B, + beta, + tensor_c.device_ref(), + 
tensor_ref_d.device_ref()); + + // Wait for kernels to finish + cudaDeviceSynchronize(); + + // Copy output data from CUTLASS and reference kernel to host for comparison + tensor_d.sync_host(); + tensor_ref_d.sync_host(); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + bool passed = cutlass::reference::host::TensorEquals( + tensor_d.host_view(), + tensor_ref_d.host_view()); + ORT_ENFORCE(passed, "Gemm kernel result wrong!"); +} + +template void run_blkq4_gemm<16, true, false, true>(int m, int n, int k); +template void run_blkq4_gemm<16, true, false, false>(int m, int n, int k); +template void run_blkq4_gemm<32, true, false, true>(int m, int n, int k); +template void run_blkq4_gemm<32, true, false, false>(int m, int n, int k); +template void run_blkq4_gemm<64, true, false, true>(int m, int n, int k); +template void run_blkq4_gemm<64, true, false, false>(int m, int n, int k); +template void run_blkq4_gemm<16, false, false, true>(int m, int n, int k); +template void run_blkq4_gemm<16, false, false, false>(int m, int n, int k); +template void run_blkq4_gemm<32, false, false, true>(int m, int n, int k); +template void run_blkq4_gemm<32, false, false, false>(int m, int n, int k); +template void run_blkq4_gemm<64, false, false, true>(int m, int n, int k); +template void run_blkq4_gemm<64, false, false, false>(int m, int n, int k); +template void run_blkq4_gemm<16, true, true, true>(int m, int n, int k); +template void run_blkq4_gemm<16, true, true, false>(int m, int n, int k); +template void run_blkq4_gemm<32, true, true, true>(int m, int n, int k); +template void run_blkq4_gemm<32, true, true, false>(int m, int n, int k); +template void run_blkq4_gemm<64, true, true, true>(int m, int n, int k); +template void run_blkq4_gemm<64, true, true, false>(int m, int n, int k); +template void run_blkq4_gemm<16, false, true, true>(int m, int n, int k); +template void run_blkq4_gemm<16, false, true, false>(int m, int n, int k); +template void run_blkq4_gemm<32, false, true, true>(int m, int n, int k); +template void run_blkq4_gemm<32, false, true, false>(int m, int n, int k); +template void run_blkq4_gemm<64, false, true, true>(int m, int n, int k); +template void run_blkq4_gemm<64, false, true, false>(int m, int n, int k); + +} // namespace test +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_sm80_prepack_test.cc b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_sm80_prepack_test.cc deleted file mode 100644 index aba2b0b2cb4a4..0000000000000 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_sm80_prepack_test.cc +++ /dev/null @@ -1,507 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#include - -#include "core/framework/float16.h" -#include "core/mickey/blk_q4/prepack_sm80.h" -#include "core/mlas/inc/mlas_q4.h" - -#include "gtest/gtest.h" - -namespace onnxruntime { -namespace test { - -void prepack_weights_ref( - int rows, - int columns, - const MatrixRef& tensor_weight, - const MatrixRef& tensor_weight_prepacked) { - EXPECT_TRUE(tensor_weight.shape()[0] == rows / 2 && tensor_weight.shape()[1] == columns); - EXPECT_TRUE(tensor_weight_prepacked.shape()[0] == rows && tensor_weight_prepacked.shape()[1] == columns / 2); - - auto t0_base = make_Position(0, 0); - auto t1_base = make_Position(4, 0); - auto t2_base = make_Position(0, 8); - auto t3_base = make_Position(4, 8); - for (int col_dtile = 0; col_dtile < columns / 16; ++col_dtile) { - for (int row_dtile = 0; row_dtile < rows / 16; ++row_dtile) { - // Packing from a 8x16 tile to a 16x8 tile - auto dtile_base = make_Position(row_dtile * 8, col_dtile * 16); - auto packed_tile_base = make_Position(row_dtile * 16, col_dtile * 8); - for (int col = 0; col < 8; ++col) { - for (int row = 0; row < 4; ++row) { - auto cord = make_Position(row, col); - auto packed_cord = packed_tile_base + make_Position(row * 4, col); // packed tile is 16x8 - uint8_t buf[4]; - buf[0] = tensor_weight.at(dtile_base + t0_base + cord); - buf[1] = tensor_weight.at(dtile_base + t1_base + cord); - buf[2] = tensor_weight.at(dtile_base + t2_base + cord); - buf[3] = tensor_weight.at(dtile_base + t3_base + cord); - - // [0, 1, 2, 3, 4, 5, 6, 7] => [0, 2, 4, 6, 1, 3, 5, 7] so that each pair of adjacent weights - // are in different b16 register at the same positions. This makes it easier to convert to - // fp16x2 format in a b32 register - - tensor_weight_prepacked.at(packed_cord) = (buf[0] & 0x0f) | ((buf[1] & 0x0f) << 4); - tensor_weight_prepacked.at(packed_cord + make_Position(1, 0)) = (buf[2] & 0x0f) | ((buf[3] & 0x0f) << 4); - tensor_weight_prepacked.at(packed_cord + make_Position(2, 0)) = ((buf[0] & 0xf0) >> 4) | (buf[1] & 0xf0); - tensor_weight_prepacked.at(packed_cord + make_Position(3, 0)) = ((buf[2] & 0xf0) >> 4) | (buf[3] & 0xf0); - } - } - } - } -} - -template < - typename ScaleElementT, - typename Layout, - typename QuantBlocking> -void prepack_quant_scales_ref( - int rows, - int columns, - const MatrixRef& tensor_scale, - const MatrixRef& tensor_scale_prepacked) { - EXPECT_TRUE(tensor_scale.shape()[0] == (rows / QuantBlocking::kRow) && tensor_scale.shape()[1] == (columns / QuantBlocking::kColumn)); - EXPECT_TRUE(tensor_scale_prepacked.shape() == tensor_scale.shape()); - - // Only prepacking scale and offset tensors for a often used special case: - // 16b gemm (2 elements per 32b register, operand tile shape 8x8) - // 2 B operand tiles per mma instruction stacked on k dimension - // (1,n) quantization blocking - if constexpr (sizeof(ScaleElementT) == 2 && QuantBlocking::kRow == 1) { - // In Ampere tensor op, each operand B tile is 8 x 8, in a warp of 32 threads, each thread - // holds a fragment of the tile containing 2 elements in the k dimension. Most often we use - // mma instruction shape of 16x8x16, which means 2 B tiles are stacked in the k dimension, - // as shown below (T stands for thread): - // T0, T4, T8, T12 - // T1, T5, T9, T13 - // T2, T6, T10, T14 - // T3, T7, T11, T15 - // T0, T4, T8, T12 - // T1, T5, T9, T13 - // T2, T6, T10, T14 - // T3, T7, T11, T15 - // - // We need to deliver quantization scale and offset elements to the corresponding threads, - // so we can perform dequantization efficiently. 
With a column major layout, each thread - // needs two separate loads for a mma instruction, due to the tile fragment layout shown - // above. To reduce the number of loads, we rearrange each column as below, so we can use - // a single load to load fragments for two tiles: - // T0 T0 - // T1 T0 - // T2 T1 - // T3 => T1 - // T0 T2 - // T1 T2 - // T2 T3 - // T3 T3 - - for (int col = 0; col < tensor_scale.shape()[1]; ++col) { - for (int row_blk = 0; row_blk < tensor_scale.shape()[0]; row_blk += 16) { - for (int thread_id = 0; thread_id < 4; thread_id++) { - const int dst_idx = row_blk + thread_id * 4; - const int src_idx = row_blk + thread_id * 2; - tensor_scale_prepacked.at(dst_idx + 0, col) = tensor_scale.at(src_idx + 0, col); - tensor_scale_prepacked.at(dst_idx + 1, col) = tensor_scale.at(src_idx + 1, col); - tensor_scale_prepacked.at(dst_idx + 2, col) = tensor_scale.at(src_idx + 8, col); - tensor_scale_prepacked.at(dst_idx + 3, col) = tensor_scale.at(src_idx + 9, col); - } - } - } - } else { - // In all other cases, we don't prepack scale or offset - FAIL() << "Scale prepack only supported for 16b gemm with (1,n) quantization blocking"; - } -} - -template -void prepack_quant_offsets_ref( - size_t rows, - size_t columns, - MatrixRef tensor_offset, - MatrixRef tensor_offset_prepacked) { - // EXPECT_TRUE(tensor_offset.shape()[0] == (rows / QuantBlocking::kRow) && tensor_offset.shape()[1] == (columns / QuantBlocking::kColumn)); - EXPECT_TRUE(tensor_offset_prepacked.shape() == tensor_offset.shape()); - - // Only prepacking scale and offset tensors for a often used special case: - // 16b gemm (2 elements per 32b register, operand tile shape 8x8) - // 2 B operand tiles per mma instruction stacked on k dimension - // (1,n) quantization blocking - if constexpr (QuantBlocking::kRow != 1) { - FAIL() << "Offsets prepack only supported for 16b gemm with (1,n) quantization blocking"; - } - // In Ampere tensor op, each operand B tile is 8 x 8, in a warp of 32 threads, each thread - // holds a fragment of the tile containing 2 elements in the k dimension. Most often we use - // mma instruction shape of 16x8x16, which means 2 B tiles are stacked in the k dimension, - // as shown below (T stands for thread): - // T0, T4, T8, T12 - // T1, T5, T9, T13 - // T2, T6, T10, T14 - // T3, T7, T11, T15 - // T0, T4, T8, T12 - // T1, T5, T9, T13 - // T2, T6, T10, T14 - // T3, T7, T11, T15 - // - // We need to deliver quantization scale and offset elements to the corresponding threads, - // so we can perform dequantization efficiently. With a column major layout, each thread - // needs two separate loads for a mma instruction, due to the tile fragment layout shown - // above. 
To reduce the number of loads, we rearrange each column as below, so we can use - // a single load to load fragments for two tiles: - // T0 T0 - // T1 T0 - // T2 T1 - // T3 => T1 - // T0 T2 - // T1 T2 - // T2 T3 - // T3 T3 - if (tensor_offset_prepacked.good()) { - for (int col = 0; col < tensor_offset.shape()[1]; ++col) { - for (int row_blk = 0; row_blk < tensor_offset.shape()[0]; row_blk += 16) { - for (int thread_id = 0; thread_id < 4; thread_id++) { - const int dst_idx = row_blk + thread_id * 4; - const int src_idx = row_blk + thread_id * 2; - // [a, b, c, d] => [a, c, b, d] so that adjacent weights are in their own - // 16b element: [a, x, b, x] and [x, c, x, d], which makes it easier to - // convert to fp16x2 format in a b32 register - tensor_offset_prepacked.at(dst_idx + 0, col) = tensor_offset.at(src_idx + 0, col); - tensor_offset_prepacked.at(dst_idx + 1, col) = tensor_offset.at(src_idx + 8, col); - tensor_offset_prepacked.at(dst_idx + 2, col) = tensor_offset.at(src_idx + 1, col); - tensor_offset_prepacked.at(dst_idx + 3, col) = tensor_offset.at(src_idx + 9, col); - } - } - } - } -} - -template -void testPrepack(int rows, int columns, bool has_offset = true) { - using ElementT = MLFloat16; - constexpr int block_size = 32; - using Base = onnxruntime::cuda::BlockwiseQuantization< - ElementT, - block_size, - 4, - ColumnMajorQuantBlocking>; - - using QuantBlocking = typename Base::QuantBlocking; - using ElementW = typename Base::ElementW; - using LayoutWPack = typename Base::LayoutWPack; - using ElementQOffset = typename Base::ElementQOffset; - using LayoutQmeta = typename Base::LayoutQmeta; - - unsigned int seed = 28571; // Replace with desired seed value - std::seed_seq seq{seed}; - std::mt19937 gen(seq); - std::uniform_int_distribution<> dis(0, 8192); - - const auto q_weight_shape = Base::get_quant_weights_shape(rows, columns); - const auto meta_shape = Base::get_quant_meta_shape(rows, columns); - - // - // For testing quantization and dequantization, it is not straight - // forward to avoid flaky tests due to rounding errors. The way we - // try to achieve this is to: - // 1. Generate a set of quantized weights, scales and offsets - // 2. Dequantize the weights - // 3. Quantize the dequantized weights - // 4. Compare the dequantied-and-then-quantized weights with - // the original quantized weights - // - // Random filling of the initial values are key to get this right. - // For weights, we must ensure each block gets a full range of - // values, i.e. must contain 0 and 15. And for scales, they must - // all be positive. 
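The comment above describes the round-trip strategy this (now deleted) test relied on: quantize, dequantize, re-quantize with the same parameters, and compare the integer codes, so rounding error can never make the test flaky. Below is a minimal sketch of that idea for a single 4-bit block; the affine formula and every name in it are illustrative assumptions, not the MLAS/CUTLASS routines the test actually exercised.

```cpp
// Round-trip check for one 4-bit block, affine scheme: x = scale * (q - zero_point).
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // 1. Quantized codes spanning the full 4-bit range, as the comment requires.
  const std::vector<uint8_t> q = {0, 3, 7, 11, 15};
  const float scale = 0.25f;  // scales must be positive
  const int zero_point = 8;

  for (size_t i = 0; i < q.size(); ++i) {
    // 2. Dequantize.
    const float x = scale * (static_cast<int>(q[i]) - zero_point);
    // 3. Re-quantize with the same scale/zero_point.
    const int requant = static_cast<int>(x / scale) + zero_point;
    // 4. Integer-exact comparison: no rounding tolerance needed.
    assert(requant == static_cast<int>(q[i]));
  }
  return 0;
}
```

Because the comparison is on integer codes rather than floating-point values, the test passes or fails deterministically.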
- // - - std::vector q_weights(q_weight_shape.product()); - MatrixRef tensor_q_weight( - q_weights, make_Position(rows / 2, columns)); - int v = 7; - for (int c = 0; c < tensor_q_weight.shape()[1]; c++) { - for (int r = 0; r < tensor_q_weight.shape()[0]; ++r) { - uint8_t v0 = static_cast(v); - v = (v + 5) % 16; - if (v == 11 || v == 7 || v == 3) { - // making the cycle 13 instead of 16, avoiding same values in a row - v = (v + 5) % 16; - } - uint8_t v1 = 0; - if (r + 1 < rows) { - v1 = static_cast(v); - v = (v + 5) % 16; - if (v == 11 || v == 7 || v == 3) { - // making the cycle 13 instead of 16, avoiding same values in a row - v = (v + 5) % 16; - } - } - - tensor_q_weight.at(r, c) = ElementW((v1 << 4) | v0); - } - } - - std::vector q_scales(meta_shape.product()); - for (size_t i = 0; i < q_scales.size(); i++) { - q_scales[i] = ElementT(((dis(gen) % 127) + 1) / 32.0f); - } - MatrixRef tensor_scale( - q_scales, meta_shape); - - std::vector q_zp(meta_shape.product()); - for (size_t i = 0; i < q_zp.size(); i++) { - q_zp[i] = dis(gen) % 16; - } - MatrixRef tensor_offset( - q_zp, meta_shape); - -#if 0 // debug - // Fill tensor_q_weight with the patterned data, easier to debug with print - int loop_val = 0; - int offset = 3; - for (int col_tile = 0; col_tile < tensor_q_weight.extent().column()/8; ++col_tile) { - for (int row_tile = 0; row_tile < tensor_q_weight.extent().row()/4; ++row_tile) { - for (int col = 0; col < 8; ++col) { - for (int row = 0; row < 4; ++row) { - auto weight_cord = cutlass::make_Coord(row_tile * 4 + row, col_tile * 8 + col); - auto val = (loop_val + offset) % 256; - tensor_q_weight.at(weight_cord) = ElementW(val); - loop_val++; - if (loop_val == 256) { - loop_val = 0; - offset += 11; - } - } - } - } - } - for (int col = 0; col < tensor_scale.extent().column(); ++col){ - int c = col * QuantBlocking::kColumn; - for (int row = 0; row < tensor_scale.extent().row(); ++row){ - int r = row * QuantBlocking::kRow; - auto weight_cord = cutlass::make_Coord(r/2, c); - int w = 0; - if (r % 2 == 0) { - w = int(tensor_q_weight.at(weight_cord) & 0x0f); - } else { - w = int(tensor_q_weight.at(weight_cord) >> 4); - } - tensor_scale.at({row, col}) = w; - tensor_offset.at({row, col}) = ElementQOffset(w); - } - } - - int fill_val = -512; - int factor = 1; - for (int col = 0; col < tensor_scale.extent().column(); ++col){ - for (int row = 0; row < tensor_scale.extent().row(); ++row){ - tensor_scale.at({row, col}) = ElementQScale((float)fill_val * float(factor)); - fill_val++; - if (fill_val == 512) { - fill_val = -512; - factor += 1; - } - } - } - -#endif // debug - - std::vector dequants(rows * columns); - MatrixRef tensor_dequant(dequants, make_Position(rows, columns)); - - // Dequantize weights and save into matrix B for reference - for (int col = 0; col < tensor_dequant.shape()[1]; ++col) { - for (int row = 0; row < tensor_dequant.shape()[0]; ++row) { - auto weight_cord = make_Position(row / 2, col); - auto scale_cord = make_Position(row / QuantBlocking::kRow, col / QuantBlocking::kColumn); - const uint8_t offset = has_offset ? 
tensor_offset.at(scale_cord) : 8; - int w = 0; - if (row % 2 == 0) { - w = int(tensor_q_weight.at(weight_cord) & 0x0f); - } else { - w = int(tensor_q_weight.at(weight_cord) >> 4); - } - float scale = float(tensor_scale.at(scale_cord)); - float dequant = scale * float(w - offset); - tensor_dequant.at(row, col) = ElementT(dequant); - // Prints for help debugging in case of test failure - // fprintf(stderr, "(%2d,%2d)= %2d, %2d, %f, %f\n", row, col, w, offset, scale, dequant); - } - } - - int q_rows, q_cols; - MlasBlockwiseQuantizedShape( - block_size, ColumnMajorQuantBlocking, rows, columns, q_rows, q_cols); - // to be exact, q_rows are padded to multiple of block_size, deal with it when we care about strange shapes - EXPECT_EQ(q_rows, q_weight_shape[0]); - EXPECT_EQ(q_cols, q_weight_shape[1]); - - // - // Quantization tool outputs: - // - std::vector o_elements(q_rows * q_cols); - MatrixRef tensor_o_elements(o_elements, q_weight_shape); - - std::vector o_scales(meta_shape.product()); - MatrixRef tensor_o_scales(o_scales, meta_shape); - - std::vector o_zp(((meta_shape[0] + 1) / 2) * meta_shape[1], true); - MatrixRef tensor_o_zp( - o_zp, make_Position((meta_shape[0] + 1) / 2, meta_shape[1])); - - MlasQuantizeBlockwise(o_elements.data(), o_scales.data(), has_offset ? o_zp.data() : nullptr, - tensor_dequant.data().data(), block_size, - ColumnMajorQuantBlocking, rows, columns, columns, nullptr); - for (int col = 0; col < tensor_q_weight.shape()[1]; ++col) { - for (int row = 0; row < tensor_q_weight.shape()[0]; ++row) { - EXPECT_EQ(tensor_o_elements.at(row, col), tensor_q_weight.at(row, col)) - << "quantized value mismatch at [" << row << "," << col << "]" - << " shape[" << rows << "," << columns << "]" - << (ColumnMajorQuantBlocking ? "Column-wise-block" : "Row-wise-block") - << std::endl; - } - } - - for (int col = 0; col < meta_shape[1]; ++col) { - for (int row = 0; row < meta_shape[0]; row += 2) { - if (has_offset) { - uint8_t pair01 = tensor_o_zp.at(row / 2, col); - EXPECT_EQ(tensor_offset.at(row + 0, col), pair01 & 0xf) - << "quantized offset mismatch at [" << row << "," << col << "]" - << " shape[" << rows << "," << columns << "]" - << (ColumnMajorQuantBlocking ? "Column-wise-block" : "Row-wise-block") - << std::endl; - if (row + 1 < meta_shape[0]) { - EXPECT_EQ(tensor_offset.at(row + 1, col), pair01 >> 4) - << "quantized offset mismatch at [" << row + 1 << "," << col << "]" - << " shape[" << rows << "," << columns << "]" - << (ColumnMajorQuantBlocking ? "Column-wise-block" : "Row-wise-block") - << std::endl; - } - } - - EXPECT_EQ(tensor_scale.at(row + 0, col), tensor_o_scales.at(row + 0, col)) - << "quantized scale mismatch at [" << row << "," << col << "]" - << " shape[" << rows << "," << columns << "]" - << (ColumnMajorQuantBlocking ? "Column-wise-block" : "Row-wise-block") - << std::endl; - if (row + 1 < meta_shape[0]) { - EXPECT_EQ(tensor_scale.at(row + 1, col), tensor_o_scales.at(row + 1, col)) - << "quantized scale mismatch at [" << row + 1 << "," << col << "]" - << " shape[" << rows << "," << columns << "]" - << (ColumnMajorQuantBlocking ? "Column-wise-block" : "Row-wise-block") - << std::endl; - } - } - } - - // - // Now we just setup fp16 weights tensor_dequant, quantized weights tensor_q_weight, - // quantization scale tensor_scale and quantization offset tensor_offset. The above - // testing just make sure our test setup is consistent with quantization tool output. 
- // - // Next we test the prepack code - // - - std::vector packed_w_ref(q_weight_shape.product()); - MatrixRef tensor_packed_w_ref( - packed_w_ref, make_Position(rows, columns / 2)); - prepack_weights_ref(rows, columns, tensor_q_weight, tensor_packed_w_ref); - - std::vector packed_w(q_weight_shape.product()); - MatrixRef tensor_packed_w( - packed_w, make_Position(rows, columns / 2)); - Base::prepack_weights(rows, columns, o_elements, packed_w); - - for (int col = 0; col < tensor_packed_w.shape()[1]; ++col) { - for (int row = 0; row < tensor_packed_w.shape()[0]; ++row) { - EXPECT_EQ(tensor_packed_w_ref.at(row, col), tensor_packed_w.at(row, col)) - << "prepacked weights mismatch at [" << row << "," << col << "]" - << " shape[" << rows << "," << columns << "]" - << (ColumnMajorQuantBlocking ? "Column-wise-block" : "Row-wise-block") - << std::endl; - } - } - - std::vector packed_scales_ref(meta_shape.product()); - MatrixRef tensor_packed_s_ref = - Base::ShouldRearrangeMeta ? make_MatrixRef(packed_scales_ref, meta_shape) - : tensor_scale; - if (Base::ShouldRearrangeMeta) { - prepack_quant_scales_ref( - rows, columns, tensor_scale.const_ref(), tensor_packed_s_ref); - } - - std::vector packed_scales(meta_shape.product()); - MatrixRef tensor_packed_s( - packed_scales, meta_shape); - Base::prepack_quant_scales(rows, columns, o_scales, packed_scales); - - for (int col = 0; col < tensor_packed_s.shape()[1]; ++col) { - for (int row = 0; row < tensor_packed_s.shape()[0]; ++row) { - EXPECT_EQ(tensor_packed_s_ref.at(row, col), tensor_packed_s.at(row, col)) - << "prepacked scales mismatch at [" << row << "," << col << "]" - << " shape[" << rows << "," << columns << "]" - << (ColumnMajorQuantBlocking ? "Column-wise-block" : "Row-wise-block") - << std::endl; - } - } - - if (has_offset) { - std::vector packed_zp_ref(meta_shape.product()); - MatrixRef tensor_packed_zp_ref = - Base::ShouldRearrangeMeta ? make_MatrixRef(packed_zp_ref, meta_shape) - : tensor_offset; - if (Base::ShouldRearrangeMeta) { - prepack_quant_offsets_ref( - rows, columns, tensor_offset.const_ref(), tensor_packed_zp_ref); - } - - std::vector packed_zp(meta_shape.product()); - MatrixRef tensor_packed_zp( - packed_zp, meta_shape); - Base::prepack_quant_offsets(rows, columns, o_zp, packed_zp); - - for (int col = 0; col < tensor_packed_zp.shape()[1]; ++col) { - for (int row = 0; row < tensor_packed_zp.shape()[0]; ++row) { - EXPECT_EQ(tensor_packed_zp_ref.at(row, col), tensor_packed_zp.at(row, col)) - << "prepacked offsets mismatch at [" << row << "," << col << "]" - << " shape[" << rows << "," << columns << "]" - << (ColumnMajorQuantBlocking ? 
"Column-wise-block" : "Row-wise-block") - << std::endl; - } - } - } -} - -// TODO: code runs on CPU, but this is for sm80 only, maybe enable only when test on sm80 -TEST(BlkQ4_GEMM, PrepackSm80Test) { - testPrepack(32, 32); - testPrepack(32, 32, false); - testPrepack(32, 32); - testPrepack(32, 32, false); - testPrepack(32, 64); - testPrepack(32, 128); - testPrepack(32, 256); - testPrepack(64, 32); - testPrepack(128, 32); - testPrepack(256, 32); - testPrepack(256, 256); - testPrepack(32, 128, false); - testPrepack(128, 32, false); - testPrepack(256, 256, false); - testPrepack(32, 64); - testPrepack(32, 128); - testPrepack(32, 256); - testPrepack(64, 32); - testPrepack(128, 32); - testPrepack(256, 32); - testPrepack(256, 256); - testPrepack(32, 128, false); - testPrepack(128, 32, false); - testPrepack(256, 256, false); -} - -} // namespace test -} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc index 5505d689381c9..8dfaaedcbb378 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc @@ -29,7 +29,7 @@ TEST(TestDeferredRelease, WithArena) { AllocatorPtr cpu_pinned_alloc = ep.CreatePreferredAllocators()[1]; // let the CudaStream instance "own" the default stream, so we can avoid the // work to initialize cublas/cudnn/... It is ok since it is just a customized unit test. - CudaStream stream(nullptr, gpu_alloctor->Info().device, cpu_pinned_alloc, false, true, nullptr, nullptr); + CudaStream stream(nullptr, gpu_alloctor->Info().device, cpu_pinned_alloc, false, true, nullptr, nullptr, info); // 10 MB const size_t n_bytes = 10 * 1000000; const int64_t n_allocs = 64; @@ -71,7 +71,7 @@ TEST(TestDeferredRelease, WithoutArena) { // For details, see CUDAPinnedAllocator in cuda_allocator.cc. // let the CudaStream instance "own" the default stream, so we can avoid the // work to initialize cublas/cudnn/... It is ok since it is just a customized unit test. - CudaStream stream(nullptr, gpu_alloctor->Info().device, cuda_pinned_alloc, false, true, nullptr, nullptr); + CudaStream stream(nullptr, gpu_alloctor->Info().device, cuda_pinned_alloc, false, true, nullptr, nullptr, info); // 10 MB const size_t n_bytes = 10 * 1000000; const int64_t n_allocs = 64; From 1e78bcea6011ac43093bb08a647cf3717d73047a Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 5 Mar 2024 13:33:01 -0800 Subject: [PATCH 110/279] Implement CUDA IsInf-10,20 (#19772) ### Description Implment IsInf-10,20 for CUDA. Add FP16 types also on CPU. ### Motivation and Context Certain models lag in performance due to IsInf not available on CUDA. 
--- docs/OperatorKernels.md | 4 +- .../core/framework/data_types_internal.h | 2 +- .../core/providers/cpu/tensor/isinf.cc | 64 ++++++++++--- .../core/providers/cuda/cu_inc/common.cuh | 94 +++++++++++++++++++ onnxruntime/core/providers/cuda/cuda_common.h | 18 ++++ .../providers/cuda/cuda_execution_provider.cc | 5 + .../cuda/math/unary_elementwise_ops.cc | 38 ++++++++ .../cuda/math/unary_elementwise_ops.h | 12 +++ .../cuda/math/unary_elementwise_ops_impl.cu | 38 ++++++++ .../cuda/math/unary_elementwise_ops_impl.h | 15 +++ .../core/providers/rocm/cu_inc/common.cuh | 94 +++++++++++++++++++ .../providers/rocm/rocm_execution_provider.cc | 9 ++ .../test/providers/cpu/tensor/isinf_test.cc | 42 +++++++++ 13 files changed, 420 insertions(+), 15 deletions(-) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 71b0def659741..4514a85531d6b 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -160,7 +160,7 @@ Do not modify directly.* |||[1, 10]|**B** = tensor(bool)
 **V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |ImageScaler|*in* input:**T**<br> *out* output:**T**|1+|**T** = tensor(float)|
 |InstanceNormalization|*in* input:**T**<br> *in* scale:**T**<br> *in* B:**T**<br> *out* output:**T**|6+|**T** = tensor(float)|
-|IsInf|*in* X:**T1**<br> *out* Y:**T2**|20+|**T1** = tensor(double), tensor(float), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)<br> **T2** = tensor(bool)|
+|IsInf|*in* X:**T1**<br> *out* Y:**T2**|20+|**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)<br> **T2** = tensor(bool)|
 |||[10, 19]|**T1** = tensor(double), tensor(float)<br> **T2** = tensor(bool)|
 |IsNaN|*in* X:**T1**<br> *out* Y:**T2**|20+|**T1** = tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)<br> **T2** = tensor(bool)|
 |||[13, 19]|**T1** = tensor(double), tensor(float), tensor(float16)<br> **T2** = tensor(bool)|
@@ -631,6 +631,8 @@ Do not modify directly.*
 |||[1, 10]|**B** = tensor(bool)<br> **V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |ImageScaler|*in* input:**T**<br> *out* output:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
 |InstanceNormalization|*in* input:**T**<br> *in* scale:**T**<br> *in* B:**T**<br> *out* output:**T**|6+|**T** = tensor(double), tensor(float), tensor(float16)|
+|IsInf|*in* X:**T1**<br> *out* Y:**T2**|20+|**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)<br> **T2** = tensor(bool)|
+|||[10, 19]|**T1** = tensor(double), tensor(float)<br> **T2** = tensor(bool)|
 |LRN|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
 |||[1, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
 |LSTM|*in* X:**T**<br> *in* W:**T**<br> *in* R:**T**<br> *in* B:**T**<br> *in* sequence_lens:**T1**<br> *in* initial_h:**T**<br> *in* initial_c:**T**<br> *in* P:**T**<br> *out* Y:**T**<br> *out* Y_h:**T**<br> *out* Y_c:**T**|14+|**T** = tensor(double), tensor(float), tensor(float16)<br>
**T1** = tensor(int32)| diff --git a/include/onnxruntime/core/framework/data_types_internal.h b/include/onnxruntime/core/framework/data_types_internal.h index fbeee8a2aedc5..3a3b5cb6888f2 100644 --- a/include/onnxruntime/core/framework/data_types_internal.h +++ b/include/onnxruntime/core/framework/data_types_internal.h @@ -305,7 +305,7 @@ class CallableDispatchableHelper { return 0; } - void CheckCalledOnce() { + void CheckCalledOnce() const { ORT_ENFORCE(called_ == 1, "Unsupported data type: ", dt_type_); } }; diff --git a/onnxruntime/core/providers/cpu/tensor/isinf.cc b/onnxruntime/core/providers/cpu/tensor/isinf.cc index 1b449f46927a2..9d18d1fa62288 100644 --- a/onnxruntime/core/providers/cpu/tensor/isinf.cc +++ b/onnxruntime/core/providers/cpu/tensor/isinf.cc @@ -23,7 +23,9 @@ ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPE_LIST( using IsInfTypesOpset20 = TypeList< float, - double + double, + MLFloat16, + BFloat16 #if !defined(DISABLE_FLOAT8_TYPES) , Float8E4M3FN, Float8E4M3FNUZ, Float8E5M2, Float8E5M2FNUZ @@ -76,10 +78,8 @@ ONNX_CPU_OPERATOR_KERNEL( IsInf); IsInf::IsInf(const OpKernelInfo& info) : OpKernel(info) { - Status status = info.GetAttr("detect_positive", &detect_positive_); - ORT_ENFORCE(status.IsOK(), "Failed to obtain detect_positive"); - status = info.GetAttr("detect_negative", &detect_negative_); - ORT_ENFORCE(status.IsOK(), "Failed to obtain detect_negative"); + detect_positive_ = info.GetAttrOrDefault("detect_positive", 1); + detect_negative_ = info.GetAttrOrDefault("detect_negative", 1); opset_ = info.node().SinceVersion(); } @@ -87,29 +87,67 @@ namespace isinf_internal { template struct ComputeDispatchTarget { void operator()(const Tensor& X, Tensor& Y, bool detect_positive, bool detect_negative) const { - const auto total_items = X.Shape().Size(); + auto input_data = X.DataAsSpan(); auto output_data = Y.MutableData(); if (detect_positive && detect_negative) { EigenMap(Y) = EigenMap(X).array().isInf(); } else if (detect_positive) { - auto input_data = X.Data(); - auto end_data = input_data + total_items; std::transform( - input_data, end_data, output_data, [](T v) { + input_data.begin(), input_data.end(), output_data, [](T v) { return (v == std::numeric_limits::infinity()); }); } else if (detect_negative) { - auto input_data = X.Data(); - auto end_data = input_data + total_items; std::transform( - input_data, end_data, output_data, [](T v) { + input_data.begin(), input_data.end(), output_data, [](T v) { return (v == -std::numeric_limits::infinity()); }); } else { // all false - memset(output_data, false, onnxruntime::narrow(total_items)); + memset(output_data, false, input_data.size()); + } + } +}; + +template <> +struct ComputeDispatchTarget { + void operator()(const Tensor& X, Tensor& Y, bool detect_positive, bool detect_negative) const { + auto output_data = Y.MutableData(); + auto input_data = X.DataAsSpan(); + if (detect_positive && detect_negative) { + std::transform(input_data.begin(), input_data.end(), output_data, + [](MLFloat16 v) { return v.IsInfinity(); }); + } else if (detect_positive) { + std::transform(input_data.begin(), input_data.end(), output_data, + [](MLFloat16 v) { return v.IsPositiveInfinity(); }); + } else if (detect_negative) { + std::transform(input_data.begin(), input_data.end(), output_data, + [](MLFloat16 v) { return v.IsNegativeInfinity(); }); + } else { + // all false + memset(output_data, false, input_data.size()); + } + } +}; + +template <> +struct ComputeDispatchTarget { + void operator()(const Tensor& X, Tensor& Y, bool detect_positive, bool 
detect_negative) const { + auto output_data = Y.MutableData(); + auto input_data = X.DataAsSpan(); + if (detect_positive && detect_negative) { + std::transform(input_data.begin(), input_data.end(), output_data, + [](BFloat16 v) { return v.IsInfinity(); }); + } else if (detect_positive) { + std::transform(input_data.begin(), input_data.end(), output_data, + [](BFloat16 v) { return v.IsPositiveInfinity(); }); + } else if (detect_negative) { + std::transform(input_data.begin(), input_data.end(), output_data, + [](BFloat16 v) { return v.IsNegativeInfinity(); }); + } else { + // all false + memset(output_data, false, input_data.size()); } } }; diff --git a/onnxruntime/core/providers/cuda/cu_inc/common.cuh b/onnxruntime/core/providers/cuda/cu_inc/common.cuh index 66794f88d8670..bba9178348132 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/common.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/common.cuh @@ -438,6 +438,100 @@ __device__ __inline__ BFloat16 _Fmod(BFloat16 a, BFloat16 b) { return fmodf((float)a, (float)b); } +namespace isinf_details { +template +struct IsInfTyped { + static __device__ __inline__ bool IsInf(T a) { + // cast is needed because on non MS compilers, + // because there isinf() returns int + // and we want to avoid stupid warnings + return static_cast(isinf(a)); + } + static __device__ __inline__ bool IsInfPos(T a) { + return a == std::numeric_limits::infinity(); + } + static __device__ __inline__ bool IsInfNeg(T a) { + return a == -std::numeric_limits::infinity(); + } +}; + +template <> +struct IsInfTyped { + static __device__ __inline__ bool IsInf(half a) { + return MLFloat16::kPositiveInfinityBits == + static_cast(*reinterpret_cast(&a) & ~MLFloat16::kSignMask); + } + static __device__ __inline__ bool IsInfPos(half a) { + return MLFloat16::kPositiveInfinityBits == *reinterpret_cast(&a); + } + static __device__ __inline__ bool IsInfNeg(half a) { + return MLFloat16::kNegativeInfinityBits == *reinterpret_cast(&a); + } +}; + +template <> +struct IsInfTyped { + static __device__ __inline__ bool IsInf(BFloat16 a) { + return BFloat16::kPositiveInfinityBits == + static_cast(*reinterpret_cast(&a) & ~BFloat16::kSignMask); + } + static __device__ __inline__ bool IsInfPos(BFloat16 a) { + return BFloat16::kPositiveInfinityBits == *reinterpret_cast(&a); + } + static __device__ __inline__ bool IsInfNeg(BFloat16 a) { + return BFloat16::kNegativeInfinityBits == *reinterpret_cast(&a); + } +}; + +#if !defined(DISABLE_FLOAT8_TYPES) + +template +struct ReturnFalse { + constexpr static bool __device__ __inline__ IsInf(T) { return false; } + constexpr static bool __device__ __inline__ IsInfPos(T) { return false; } + constexpr static bool __device__ __inline__ IsInfNeg(T) { return false; } +}; + +template <> +struct IsInfTyped : ReturnFalse {}; + +template <> +struct IsInfTyped : ReturnFalse {}; + +template <> +struct IsInfTyped { + static __device__ __inline__ bool IsInf(Float8E5M2 a) { + return a.val == 0b01111100 || a.val == 0b11111100; + } + static __device__ __inline__ bool IsInfPos(Float8E5M2 a) { + return a.val == 0b01111100; + } + static __device__ __inline__ bool IsInfNeg(Float8E5M2 a) { + return a.val == 0b11111100; + } +}; + +template <> +struct IsInfTyped : ReturnFalse {}; + +#endif +} // namespace isinf_details + +template +struct _IsInf { + __device__ __inline__ bool operator()(T a) const { + if constexpr (detect_positive && detect_negative) { + return isinf_details::IsInfTyped::IsInf(a); + } else if constexpr (detect_positive) { + return 
isinf_details::IsInfTyped::IsInfPos(a); + } else if constexpr (detect_negative) { + return isinf_details::IsInfTyped::IsInfNeg(a); + } else { + return false; + } + } +}; + // We would like to use 64-bit integer to support large matrices. However, CUDA seems to support only 32-bit integer // For now, use int32_t to ensure that both Linux and Windows see this as 32 bit integer type. #ifndef CUDA_LONG diff --git a/onnxruntime/core/providers/cuda/cuda_common.h b/onnxruntime/core/providers/cuda/cuda_common.h index 41c999bacee13..61da125b40953 100644 --- a/onnxruntime/core/providers/cuda/cuda_common.h +++ b/onnxruntime/core/providers/cuda/cuda_common.h @@ -70,6 +70,15 @@ class ToCudaType { } }; +template <> +class ToCudaType { + public: + typedef Float8E4M3FNUZ MappedType; + static MappedType FromFloat(float f) { + return MappedType(f); + } +}; + template <> class ToCudaType { public: @@ -79,6 +88,15 @@ class ToCudaType { } }; +template <> +class ToCudaType { + public: + typedef Float8E5M2FNUZ MappedType; + static MappedType FromFloat(float f) { + return MappedType(f); + } +}; + #endif inline bool CalculateFdmStrides(gsl::span p, const std::vector& dims) { diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 8ba282031a5d4..3c0930638a205 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -830,6 +830,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, MLFloat16, ThresholdedRelu); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, TopK); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 12, Mod); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 19, IsInf); // opset 11 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, Compress); @@ -1342,6 +1343,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, S class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, float, Gelu); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, double, Gelu); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, MLFloat16, Gelu); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsInf); template <> KernelCreateInfo BuildKernelCreateInfo() { @@ -1739,6 +1741,8 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, // opset 11 BuildKernelCreateInfo, @@ -2250,6 +2254,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif }; diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc index fd8b69d7bd2f5..00de1b37f3302 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc @@ -71,6 +71,44 @@ Status UnaryElementwise::Prepare(OpKernelContext* context, UnaryElementwisePrepa return Status::OK(); \ } +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + IsInf, + 
kOnnxDomain, + 10, + 19, + kCudaExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T1", BuildKernelDefConstraints()) + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), + IsInf); + +ONNX_OPERATOR_KERNEL_EX( + IsInf, + kOnnxDomain, + 20, + kCudaExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T1", BuildKernelDefConstraints()) + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), + IsInf); + +IsInf::IsInf(const OpKernelInfo& info) : UnaryElementwise(info) { + detect_positive_ = static_cast(info.GetAttrOrDefault("detect_positive", 1)); + detect_negative_ = static_cast(info.GetAttrOrDefault("detect_negative", 1)); + opset_ = info.node().SinceVersion(); +} + +Status IsInf::ComputeInternal(OpKernelContext* context) const { + UnaryElementwisePreparation p; + ORT_RETURN_IF_ERROR(UnaryElementwise::Prepare(context, &p)); + + Explicit_Impl_IsInf(Stream(context), opset_, detect_positive_, detect_negative_, + p.input_tensor->GetElementType(), p.input_tensor->DataRaw(), + p.output_tensor->MutableData(), + p.input_tensor->Shape().Size()); + return Status::OK(); +} + #define UNARY_OP_VERSIONED_TYPED(name, startver, endver, T) \ UNARY_ELEMENTWISE_REGISTER_VERSIONED_KERNEL(name, startver, endver, T) diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.h b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.h index 775b78c43a736..3b7d6df7221b7 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.h +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.h @@ -2,6 +2,7 @@ // Licensed under the MIT License. #pragma once + #include "core/providers/cuda/cuda_kernel.h" namespace onnxruntime { @@ -119,5 +120,16 @@ class Sign final : public UnaryElementwise { Status ComputeInternal(OpKernelContext* context) const override; }; +class IsInf final : public UnaryElementwise { + public: + explicit IsInf(const OpKernelInfo& info); + Status ComputeInternal(OpKernelContext* context) const override; + + private: + bool detect_positive_{true}; + bool detect_negative_{true}; + int opset_; +}; + } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu index 73c5ac80756be..fd8f7929d4426 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu @@ -11,6 +11,7 @@ #endif namespace onnxruntime { + namespace cuda { #define OP(name, expr) \ @@ -284,5 +285,42 @@ EXPLICIT_IMPL_CASTSAT(__nv_bfloat16, Float8E5M2) #endif +namespace isinf_details { +template +struct IsInf_DispFunc { + void operator()(cudaStream_t stream, const void* input_raw, bool* output_data, + bool detect_positive, bool detect_negative, size_t count) const { + using CudaType = typename ToCudaType::MappedType; + const auto* input_data = reinterpret_cast(input_raw); + if (detect_positive && detect_negative) { + UnaryElementWiseImpl(stream, input_data, output_data, _IsInf{}, count); + } else if (detect_positive) { + UnaryElementWiseImpl(stream, input_data, output_data, _IsInf{}, count); + } else if (detect_negative) { + UnaryElementWiseImpl(stream, input_data, output_data, _IsInf{}, count); + } else { + UnaryElementWiseImpl(stream, input_data, output_data, _IsInf{}, count); + } + } +}; + +} // namespace isinf_details + +void Explicit_Impl_IsInf(cudaStream_t stream, int op_set, + bool detect_positive, bool detect_negative, + int32_t 
input_data_type, + const void* input_raw, bool* output_data, + size_t count) { + if (op_set < 20) { + utils::MLTypeCallDispatcher dispatcher{input_data_type}; + dispatcher.Invoke(stream, input_raw, output_data, + detect_positive, detect_negative, count); + } else { + utils::MLTypeCallDispatcher dispatcher{input_data_type}; + dispatcher.Invoke(stream, input_raw, output_data, + detect_positive, detect_negative, count); + } +} + } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h index 608a81a24cf4f..a606d479bc79b 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h @@ -137,5 +137,20 @@ void Impl_CastSat( #endif +// IsInf + +#if !defined(DISABLE_FLOAT8_TYPES) +#define ISINF_OPSET20_ALL_FLOATS float, double, MLFloat16, BFloat16, Float8E4M3FN, Float8E4M3FNUZ, Float8E5M2, \ + Float8E5M2FNUZ +#else +#define ISINF_OPSET20_ALL_FLOATS float, double, MLFloat16, BFloat16 +#endif + +void Explicit_Impl_IsInf(cudaStream_t stream, int op_set, + bool detect_positive, bool detect_negative, + int32_t input_data_type, + const void* input_raw, bool* output_data, + size_t count); } // namespace cuda + } // namespace onnxruntime diff --git a/onnxruntime/core/providers/rocm/cu_inc/common.cuh b/onnxruntime/core/providers/rocm/cu_inc/common.cuh index 5f966ac746fcb..f3685606c17f5 100644 --- a/onnxruntime/core/providers/rocm/cu_inc/common.cuh +++ b/onnxruntime/core/providers/rocm/cu_inc/common.cuh @@ -335,6 +335,100 @@ __device__ __inline__ BFloat16 _Fmod(BFloat16 a, BFloat16 b) { return fmodf((float)a, (float)b); } +namespace isinf_details { +template +struct IsInfTyped { + static __device__ __inline__ bool IsInf(T a) { + // cast is needed because on non MS compilers, + // because there isinf() returns int + // and we want to avoid stupid warnings + return static_cast(isinf(a)); + } + static __device__ __inline__ bool IsInfPos(T a) { + return a == std::numeric_limits::infinity(); + } + static __device__ __inline__ bool IsInfNeg(T a) { + return a == -std::numeric_limits::infinity(); + } +}; + +template <> +struct IsInfTyped { + static __device__ __inline__ bool IsInf(half a) { + return MLFloat16::kPositiveInfinityBits == + static_cast(*reinterpret_cast(&a) & ~MLFloat16::kSignMask); + } + static __device__ __inline__ bool IsInfPos(half a) { + return MLFloat16::kPositiveInfinityBits == *reinterpret_cast(&a); + } + static __device__ __inline__ bool IsInfNeg(half a) { + return MLFloat16::kNegativeInfinityBits == *reinterpret_cast(&a); + } +}; + +template <> +struct IsInfTyped { + static __device__ __inline__ bool IsInf(BFloat16 a) { + return BFloat16::kPositiveInfinityBits == + static_cast(*reinterpret_cast(&a) & ~BFloat16::kSignMask); + } + static __device__ __inline__ bool IsInfPos(BFloat16 a) { + return BFloat16::kPositiveInfinityBits == *reinterpret_cast(&a); + } + static __device__ __inline__ bool IsInfNeg(BFloat16 a) { + return BFloat16::kNegativeInfinityBits == *reinterpret_cast(&a); + } +}; + +#if !defined(DISABLE_FLOAT8_TYPES) + +template +struct ReturnFalse { + constexpr static bool __device__ __inline__ IsInf(T) { return false; } + constexpr static bool __device__ __inline__ IsInfPos(T) { return false; } + constexpr static bool __device__ __inline__ IsInfNeg(T) { return false; } +}; + +template <> +struct IsInfTyped : ReturnFalse {}; + +template <> +struct IsInfTyped : 
ReturnFalse {}; + +template <> +struct IsInfTyped { + static __device__ __inline__ bool IsInf(Float8E5M2 a) { + return a.val == 0b01111100 || a.val == 0b11111100; + } + static __device__ __inline__ bool IsInfPos(Float8E5M2 a) { + return a.val == 0b01111100; + } + static __device__ __inline__ bool IsInfNeg(Float8E5M2 a) { + return a.val == 0b11111100; + } +}; + +template <> +struct IsInfTyped : ReturnFalse {}; + +#endif +} // namespace isinf_details + +template +struct _IsInf { + __device__ __inline__ bool operator()(T a) const { + if constexpr (detect_positive && detect_negative) { + return isinf_details::IsInfTyped::IsInf(a); + } else if constexpr (detect_positive) { + return isinf_details::IsInfTyped::IsInfPos(a); + } else if constexpr (detect_negative) { + return isinf_details::IsInfTyped::IsInfNeg(a); + } else { + return false; + } + } +}; + // We would like to use 64-bit integer to support large matrices. However, ROCM seems to support only 32-bit integer // For now, use int32_t to ensure that both Linux and Windows see this as 32 bit integer type. #ifndef HIP_LONG diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc index 0265c06b9a938..4a679b790ee40 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc @@ -793,6 +793,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 10, MLFloat16, ThresholdedRelu); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 10, 10, TopK); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 10, 12, Mod); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 10, 19, IsInf); // opset 11 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 11, 11, float, ArgMax); @@ -1342,6 +1343,9 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, R class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Scan); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Shape); +// Opset 20 +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 20, IsInf); + template <> KernelCreateInfo BuildKernelCreateInfo() { return {}; @@ -1738,6 +1742,8 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, // opset 11 BuildKernelCreateInfo, @@ -2294,6 +2300,9 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + + // opset 20 + BuildKernelCreateInfo, }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/test/providers/cpu/tensor/isinf_test.cc b/onnxruntime/test/providers/cpu/tensor/isinf_test.cc index 2e583c5d2547b..bd97306142f18 100644 --- a/onnxruntime/test/providers/cpu/tensor/isinf_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/isinf_test.cc @@ -99,6 +99,48 @@ TEST(IsInfTest, test_isinf_negative_double20) { run_is_inf_test(20, 0, 1, input, output); } +TEST(IsInfTest, test_isinf_mlfloat16) { + std::initializer_list input = {MLFloat16{-1.7f}, MLFloat16::NaN, MLFloat16::Infinity, 3.6_fp16, + MLFloat16::NegativeInfinity, MLFloat16::Infinity}; + 
std::initializer_list output = {false, false, true, false, true, true}; + run_is_inf_test(20, 1, 1, input, output); +} + +TEST(IsInfTest, test_isinf_positive_mlfloat16) { + std::initializer_list input = {MLFloat16{-1.7f}, MLFloat16::NaN, MLFloat16::Infinity, 3.6_fp16, + MLFloat16::NegativeInfinity, MLFloat16::Infinity}; + std::initializer_list output = {false, false, true, false, false, true}; + run_is_inf_test(20, 1, 0, input, output); +} + +TEST(IsInfTest, test_isinf_negative_mlfloat16) { + std::initializer_list input = {MLFloat16{-1.7f}, MLFloat16::NaN, MLFloat16::Infinity, 3.6_fp16, + MLFloat16::NegativeInfinity, MLFloat16::Infinity}; + std::initializer_list output = {false, false, false, false, true, false}; + run_is_inf_test(20, 0, 1, input, output); +} + +TEST(IsInfTest, test_isinf_bfloat16) { + std::initializer_list input = {BFloat16{-1.7f}, BFloat16::NaN, BFloat16::Infinity, 3.6_bfp16, + BFloat16::NegativeInfinity, BFloat16::Infinity}; + std::initializer_list output = {false, false, true, false, true, true}; + run_is_inf_test(20, 1, 1, input, output); +} + +TEST(IsInfTest, test_isinf_positive_bfloat16) { + std::initializer_list input = {BFloat16{-1.7f}, BFloat16::NaN, BFloat16::Infinity, 3.6_bfp16, + BFloat16::NegativeInfinity, BFloat16::Infinity}; + std::initializer_list output = {false, false, true, false, false, true}; + run_is_inf_test(20, 1, 0, input, output); +} + +TEST(IsInfTest, test_isinf_negative_bfloat16) { + std::initializer_list input = {BFloat16{-1.7f}, BFloat16::NaN, BFloat16::Infinity, 3.6_bfp16, + BFloat16::NegativeInfinity, BFloat16::Infinity}; + std::initializer_list output = {false, false, false, false, true, false}; + run_is_inf_test(20, 0, 1, input, output); +} + #if !defined(DISABLE_FLOAT8_TYPES) TEST(IsInfTest, test_Float8E4M3FN) { std::initializer_list input = { From d9730c7f43437070eba28d8dcdd9f94c102265ab Mon Sep 17 00:00:00 2001 From: Chi Lo <54722500+chilo-ms@users.noreply.github.com> Date: Tue, 5 Mar 2024 14:39:36 -0800 Subject: [PATCH 111/279] [TensorRT EP] Fix bug for DDS output handling for empty tensor (#19575) When the DDS output is empty tensor (i.e. any of the dimension is 0), TRT EP won't perform either cudaMemcpyAsync() nor cuda::Impl_Cast(), to prevent accidentally overwriting other location that might belong to other tensors. This PR also refactors the code to only allocate single bytes for all empty tensors. 
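A stripped-down sketch of the guard described above may help: copies and casts are skipped whenever the element count is zero, and every empty binding still gets its own one-byte dummy allocation so TensorRT sees a distinct non-null address. The helpers below are illustrative stand-ins, not the EP's real functions; the actual code allocates through ORT's allocator rather than raw cudaMalloc.

```cpp
// binding: the address handed to TensorRT; ort_output: ORT's pre-allocated output buffer.
#include <cstddef>
#include <cuda_runtime.h>

void BindOutput(void*& binding, float* ort_output, size_t elem_cnt) {
  if (ort_output != nullptr && elem_cnt > 0) {
    binding = ort_output;  // normal case: TensorRT writes straight into ORT's buffer
  } else {
    // Empty tensor: the binding still needs a distinct non-null address,
    // so reserve a single dummy byte instead of a full-size buffer.
    cudaMalloc(&binding, 1);
  }
}

void CopyDDSOutput(void* src, float* ort_output, size_t elem_cnt, cudaStream_t stream) {
  // Only copy when there is real data; an empty tensor must not trigger a copy
  // that could clobber memory belonging to some other tensor.
  if (ort_output != nullptr && elem_cnt > 0) {
    cudaMemcpyAsync(ort_output, src, elem_cnt * sizeof(float),
                    cudaMemcpyDeviceToDevice, stream);
  }
}
```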
#TODO: add unit tests to cover the DDS code paths or doing more testing with concurrent,sequential, threaded faster-rcnn using onnx_test_runner and verifying outputs --------- Co-authored-by: Chi Lo --- cmake/deps.txt | 4 +- .../tensorrt/tensorrt_execution_provider.cc | 465 ++++++------------ 2 files changed, 160 insertions(+), 309 deletions(-) diff --git a/cmake/deps.txt b/cmake/deps.txt index 9cba25b00157d..9630b6185fcf6 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -37,8 +37,8 @@ mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/bestlav0.1.1.zip;65b0f7a0d04f72f0d5a8d48af70f0366f2ab3939 onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.15.0.zip;54c3f960a0541c5d8d3e60c2933e11f5d3688a11 -#use the commit of supporting all the plugins and TRT 8.6-GA (https://github.com/onnx/onnx-tensorrt/commit/0462dc31ae78f48744b6141ae376df1f96d3f459) -onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/a43ce67187bab219520fd80f21af8bbd4354bc8c.zip;572535aefef477050f86744dfab1fef840198035 +#use the commit of Final DDS removal. DDS output is now supported by ORT TRT. +onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/bacfaaa951653cd4e72efe727a543567cb38f7de.zip;26434329612e804164ab7baa6ae629ada56c1b26 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874 diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 81346671f2aad..157cd0a200b35 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -717,6 +717,77 @@ Status ApplyProfileShapesFromInputTensorValue(std::vector(); \ + if (input_tensor_ptr != nullptr && elem_cnt > 0) { \ + data = const_cast(input_tensor_ptr); \ + } else { \ + scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, 1)); \ + data = scratch_buffers.back().get(); \ + } \ + break; \ + } + +#define CASE_GET_CAST_INPUT_TENSOR(DATA_TYPE, SrcT, DstT) \ + case DATA_TYPE: { \ + auto input_tensor_ptr = input_tensor.GetTensorData(); \ + if (input_tensor_ptr != nullptr && elem_cnt > 0) { \ + scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, elem_cnt * sizeof(DstT))); \ + data = scratch_buffers.back().get(); \ + cuda::Impl_Cast(stream, input_tensor_ptr, reinterpret_cast(data), elem_cnt); \ + } else { \ + scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, 1)); \ + data = scratch_buffers.back().get(); \ + } \ + break; \ + } + +#define CASE_GET_OUTPUT_TENSOR(DATA_TYPE, SrcT) \ + case DATA_TYPE: { \ + auto output_tensor_ptr = output_tensor.GetTensorMutableData(); \ + if (output_tensor_ptr != nullptr && elem_cnt > 0) { \ + buffers[output_name] = output_tensor_ptr; \ + } else { \ + scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, 1)); \ + buffers[output_name] = scratch_buffers.back().get(); \ + } \ + break; \ + } + +#define 
CASE_GET_CAST_OUTPUT_TENSOR(DATA_TYPE, SrcT, DstT) \ + case DATA_TYPE: { \ + auto output_tensor_ptr = output_tensor.GetTensorMutableData(); \ + if (output_tensor_ptr != nullptr && elem_cnt > 0) { \ + scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, elem_cnt * sizeof(DstT))); \ + buffers[output_name] = scratch_buffers.back().get(); \ + output_dim_sizes[i] = static_cast(elem_cnt); \ + } else { \ + scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, 1)); \ + buffers[output_name] = scratch_buffers.back().get(); \ + output_dim_sizes[i] = 1; \ + } \ + break; \ + } + +#define CASE_COPY_TENSOR(DATA_TYPE, DstT) \ + case DATA_TYPE: { \ + auto output_tensor_ptr = output_tensor.GetTensorMutableData(); \ + if (output_tensor_ptr != nullptr && elem_cnt > 0) { \ + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_tensor_ptr, allocator->getBuffer(), elem_cnt * sizeof(DstT), cudaMemcpyDeviceToDevice, stream)); \ + } \ + break; \ + } + +#define CASE_CAST_TENSOR(DATA_TYPE, SrcT, DstT) \ + case DATA_TYPE: { \ + auto output_tensor_ptr = output_tensor.GetTensorMutableData(); \ + if (output_tensor_ptr != nullptr && elem_cnt > 0) { \ + cuda::Impl_Cast(stream, reinterpret_cast(allocator->getBuffer()), reinterpret_cast(output_tensor_ptr), elem_cnt); \ + } \ + break; \ + } + /* * Set TensorRT execution context input. * @@ -737,6 +808,17 @@ Status BindContextInput(Ort::KernelContext& ctx, auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); const auto tensor_shapes = tensor_info.GetShape(); const auto tensor_type = tensor_info.GetElementType(); + /* + * Return the number of elements specified by the tensor shape (all dimensions multiplied by each other). + * For 0 dimensions, 1 is returned. If any dimension is less than 0, the result is always -1. + * + * Examples:
+ * [] = 1
+ * [1,3,4] = 12
+ * [2,0,4] = 0
+ * [-1,3,4] = -1
+ */ + const auto elem_cnt = tensor_info.GetElementCount(); if (trt_engine->isShapeInferenceIO(input_name)) { // Get the shape value of "shape tensor" @@ -765,113 +847,24 @@ Status BindContextInput(Ort::KernelContext& ctx, ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to call nvinfer1::IExecutionContext::setInputShape() for input '" + error_input_name + "'")); } - // Bind "execution tensor" input buffers + + // Bind "execution tensor" input buffer + // + // Note: If an engine binding is an empty tensor, it still needs a non-null memory address, and different tensors should have different addresses. + // Therefore, in the case of empty tensor, TRT EP always allocates a dummy byte. + // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#empty-tensors void* data = nullptr; switch (tensor_type) { - case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: { - auto input_tensor_ptr = input_tensor.GetTensorData(); - if (input_tensor_ptr == nullptr) { - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, sizeof(float))); - data = scratch_buffers.back().get(); - } else { - data = const_cast(input_tensor_ptr); - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: { - auto input_tensor_ptr = input_tensor.GetTensorData(); - if (input_tensor_ptr == nullptr) { - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, sizeof(uint16_t))); - data = scratch_buffers.back().get(); - } else { - data = const_cast(input_tensor_ptr); - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: { - auto input_tensor_ptr = input_tensor.GetTensorData(); - if (input_tensor_ptr == nullptr) { - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, sizeof(bool))); - data = scratch_buffers.back().get(); - } else { - data = const_cast(input_tensor_ptr); - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: { - auto input_tensor_ptr = input_tensor.GetTensorData(); - if (input_tensor_ptr == nullptr) { - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, sizeof(int8_t))); - data = scratch_buffers.back().get(); - } else { - data = const_cast(input_tensor_ptr); - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: { - auto input_tensor_ptr = input_tensor.GetTensorData(); - if (input_tensor_ptr == nullptr) { - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, sizeof(uint8_t))); - data = scratch_buffers.back().get(); - } else { - data = const_cast(input_tensor_ptr); - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: { - auto input_tensor_ptr = input_tensor.GetTensorData(); - if (input_tensor_ptr == nullptr) { - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, sizeof(int32_t))); - data = scratch_buffers.back().get(); - } else { - data = const_cast(input_tensor_ptr); - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: { - // Cast INT64 input to INT32 because TensorRT doesn't fully support INT64 - auto input_tensor_ptr = input_tensor.GetTensorData(); - if (input_tensor_ptr == nullptr) { - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, sizeof(int32_t))); - data = scratch_buffers.back().get(); - } else { - SafeInt input_dim_size = 1; - for (int j = 0, end = nb_dims; j < end; ++j) { - if (tensor_shapes[j] == 0) { - input_dim_size = 1; - break; - } else { - input_dim_size *= tensor_shapes[j]; - } - } - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, input_dim_size * 
sizeof(int32_t))); - data = scratch_buffers.back().get(); - cuda::Impl_Cast(stream, input_tensor_ptr, reinterpret_cast(data), input_dim_size); - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: { - // Cast DOUBLE input to FLOAT because TensorRT doesn't fully support INT64 - auto input_tensor_ptr = input_tensor.GetTensorData(); - if (input_tensor_ptr == nullptr) { - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, sizeof(float))); - data = scratch_buffers.back().get(); - } else { - SafeInt input_dim_size = 1; - for (int j = 0, end = nb_dims; j < end; ++j) { - if (tensor_shapes[j] == 0) { - input_dim_size = 1; - break; - } else { - input_dim_size *= tensor_shapes[j]; - } - } - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, input_dim_size * sizeof(float))); - data = scratch_buffers.back().get(); - cuda::Impl_Cast(stream, input_tensor_ptr, reinterpret_cast(data), input_dim_size); - } - break; - } + CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, float) + CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16, uint16_t) + CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL, bool) + CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8, int8_t) + CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t) + CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t) + // Cast int64 input to int32 input because TensorRT doesn't support int64 + CASE_GET_CAST_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t, int32_t) + // Cast double input to float because TensorRT doesn't support double + CASE_GET_CAST_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, double, float) default: { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP input onnx tensor data type: " + std::to_string(tensor_type) + " not supported."); @@ -884,7 +877,7 @@ Status BindContextInput(Ort::KernelContext& ctx, } /* - * Set TensorRT execution context output. + * Bind TensorRT execution context output. * * Please note that the "data-depedent shape" output needs corresponding allocator provided. * @@ -912,7 +905,6 @@ Status BindContextOutput(Ort::KernelContext& ctx, size_t i, std::unordered_map& output_tensors, std::unordered_map& output_dim_sizes, - std::unordered_set& dds_output_set, DDSOutputAllocatorMap& dds_output_allocator_map, std::vector>& scratch_buffers, OrtAllocator* alloc, @@ -920,142 +912,47 @@ Status BindContextOutput(Ort::KernelContext& ctx, // Get output shape nvinfer1::Dims dims = trt_context->getTensorShape(output_name); int nb_dims = dims.nbDims; - bool is_dds_output = false; + bool is_DDS = false; std::vector output_shapes(nb_dims); for (int j = 0, end = nb_dims; j < end; ++j) { // data-dependent shape if (dims.d[j] == -1) { - is_dds_output = true; - dds_output_set.emplace(output_name); + is_DDS = true; break; } output_shapes[j] = dims.d[j]; } + auto known_DDS = dds_output_allocator_map.find(output_name) != dds_output_allocator_map.end(); + // If the output tensor has data-dependent shape, TRT EP will provide an IOutputAllocator for enqueueV3 to dynamically allocate memory buffer. // Once enqueueV3 returns, TRT EP will then bind the output allocation to ORT kernel context output. 
// (Please note that we take strategy A mentioned in https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#dynamic-shaped-output, // which we defer allocation until the size is known and don't call IExecution::setTensorAddress) // // Otherwise, if the shape of the output tensor is known prior to the runtime, ORT will pre-allocate memory buffer for the output tensor for enqueueV3. - if (is_dds_output) { - if (dds_output_allocator_map.find(output_name) == dds_output_allocator_map.end()) { + if (is_DDS || known_DDS) { + if (!known_DDS) { auto allocatorPtr = std::make_unique(); trt_context->setOutputAllocator(output_name, allocatorPtr.get()); dds_output_allocator_map[output_name] = std::move(allocatorPtr); - } else { - trt_context->setOutputAllocator(output_name, dds_output_allocator_map[output_name].get()); } } else { output_tensors[i] = ctx.GetOutput(output_index, output_shapes); auto& output_tensor = output_tensors[i]; + const auto elem_cnt = output_tensor.GetTensorTypeAndShapeInfo().GetElementCount(); + switch (output_type) { - case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: { - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr == nullptr) { - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, sizeof(float))); - buffers[output_name] = scratch_buffers.back().get(); - } else { - buffers[output_name] = output_tensor_ptr; - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: { - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr == nullptr) { - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, sizeof(uint16_t))); - buffers[output_name] = scratch_buffers.back().get(); - } else { - buffers[output_name] = output_tensor_ptr; - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: { - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr == nullptr) { - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, sizeof(bool))); - buffers[output_name] = scratch_buffers.back().get(); - } else { - buffers[output_name] = output_tensor_ptr; - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: { - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr == nullptr) { - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, sizeof(int8_t))); - buffers[output_name] = scratch_buffers.back().get(); - } else { - buffers[output_name] = output_tensor_ptr; - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: { - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr == nullptr) { - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, sizeof(uint8_t))); - buffers[output_name] = scratch_buffers.back().get(); - } else { - buffers[output_name] = output_tensor_ptr; - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: { - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr == nullptr) { - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, sizeof(int32_t))); - buffers[output_name] = scratch_buffers.back().get(); - } else { - buffers[output_name] = output_tensor_ptr; - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: { - // Allocate INT32 CUDA memory for INT64 output type because TensorRT doesn't fully support INT64 - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr == nullptr) { - 
scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, sizeof(int32_t))); - buffers[output_name] = scratch_buffers.back().get(); - output_dim_sizes[i] = 1; - } else { - SafeInt output_dim_size(1); - for (int j = 0, end = nb_dims; j < end; ++j) { - if (dims.d[j] == 0) { - output_dim_size = 1; - break; - } else { - output_dim_size *= dims.d[j]; - } - } - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, output_dim_size * sizeof(int32_t))); - buffers[output_name] = scratch_buffers.back().get(); - output_dim_sizes[i] = output_dim_size; - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: { - // Allocate FLOAT CUDA memory for DOUBLE output type because TensorRT doesn't fully support DOUBLE - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr == nullptr) { - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, sizeof(float))); - buffers[output_name] = scratch_buffers.back().get(); - output_dim_sizes[i] = 1; - } else { - SafeInt output_dim_size(1); - for (int j = 0, end = nb_dims; j < end; ++j) { - if (dims.d[j] == 0) { - output_dim_size = 1; - break; - } else { - output_dim_size *= dims.d[j]; - } - } - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, output_dim_size * sizeof(float))); - buffers[output_name] = scratch_buffers.back().get(); - output_dim_sizes[i] = output_dim_size; - } - break; - } + CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, float) + CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16, uint16_t) + CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL, bool) + CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8, int8_t) + CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t) + CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t) + // Allocate int32 CUDA memory for int64 output type because TensorRT doesn't support int64 + CASE_GET_CAST_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t, int32_t) + // Allocate float CUDA memory for double output type because TensorRT doesn't support double + CASE_GET_CAST_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, double, float) default: { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP output tensor data type: " + std::to_string(output_type) + " not supported."); @@ -1068,10 +965,13 @@ Status BindContextOutput(Ort::KernelContext& ctx, } /* - * Set ORT kernel context Output. + * Bind ORT kernel context Output. * - * Note: In the case of DDS (data-dependent shape) output, TRT requires a provided allocator to allocate memory during runtime. + * In the case of DDS (data-dependent shape) output, TRT requires a provided allocator to allocate memory during runtime. * Once the output has been put in the allocation buffer, ORT calls this function to bind the allocation to ORT kernel context output. + * + * Note: Current approach of setting the ORT kernel context output is copying the output data from allocation buffer to ORT context output address which is not optimal, + * we are waiting for ORT core to support "assign" memory address to ORT context output. Some works need to be done in ORT memory planner to be aware of this memory support. 
*/ Status BindKernelOutput(Ort::KernelContext& ctx, OrtMemoryInfo* mem_info, @@ -1083,93 +983,46 @@ Status BindKernelOutput(Ort::KernelContext& ctx, auto allocator = allocator_map[output_name].get(); auto& shape = allocator->getOutputShape(); auto output_tensor = ctx.GetOutput(output_index, shape); + + /* + * Return the number of elements specified by the tensor shape (all dimensions multiplied by each other). + * For 0 dimensions, 1 is returned. If any dimension is less than 0, the result is always -1. + * + * Examples:
+ * [] = 1
+ * [1,3,4] = 12
+ * [2,0,4] = 0
+ * [-1,3,4] = -1
+ */ auto elem_cnt = output_tensor.GetTensorTypeAndShapeInfo().GetElementCount(); + /* + * Copy output data from allocation buffer to ORT kernel context output location or + * cast (int32 or float) -> (int64 or double) to ORT kernel context output location. + * + * Note: + * 1. If the output tensor is empty tensor (i.e. any of the dimension is 0) which means element count is 0, + * TRT EP does not perform cuda memory copy nor cuda cast to prevent overwriting other location that might belong to other tensors. + * 2. The cudaMemcpyAsync() and cuda::Impl_Cast() (implemented as _UnaryElementWise() in cuda ep) are all async, but we + * don't need to explicitly call cudaStreamSynchronize() after those APIs due to CUDA EP and TRT EP uses same stream, + * and within the same stream, operations are guaranteed to be executed in order. + */ switch (output_type) { - case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: { - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr != nullptr) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_tensor_ptr, allocator->getBuffer(), elem_cnt * sizeof(float), cudaMemcpyDeviceToDevice, stream)); - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: { - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr != nullptr) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_tensor_ptr, allocator->getBuffer(), elem_cnt * sizeof(uint16_t), cudaMemcpyDeviceToDevice, stream)); - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: { - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr != nullptr) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_tensor_ptr, allocator->getBuffer(), elem_cnt * sizeof(bool), cudaMemcpyDeviceToDevice, stream)); - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: { - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr != nullptr) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_tensor_ptr, allocator->getBuffer(), elem_cnt * sizeof(int8_t), cudaMemcpyDeviceToDevice, stream)); - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: { - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr != nullptr) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_tensor_ptr, allocator->getBuffer(), elem_cnt * sizeof(uint8_t), cudaMemcpyDeviceToDevice, stream)); - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: { - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr != nullptr) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_tensor_ptr, allocator->getBuffer(), elem_cnt * sizeof(int32_t), cudaMemcpyDeviceToDevice, stream)); - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: { - // The allocation buffer holds the INT32 output data since TRT doesn't support INT64 but INT32. - // So, we need to cast the data from INT32 to INT64 and then set INT64 output data to kernel context. - SafeInt output_dim_size(1); - for (size_t i = 0; i < shape.size(); ++i) { - if (shape[i] == 0) { - output_dim_size = 1; - break; - } else { - output_dim_size *= shape[i]; - } - } - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr != nullptr) { - cuda::Impl_Cast(stream, reinterpret_cast(allocator->getBuffer()), reinterpret_cast(output_tensor_ptr), output_dim_size); - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: { - // The allocation buffer holds the FLOAT output data since TRT doesn't support DOUBLE but FLOAT. 
- // So, we need to cast the data from FLOAT to DOUBEL and then set DOUBLE output data to kernel context. - SafeInt output_dim_size(1); - for (size_t i = 0; i < shape.size(); ++i) { - if (shape[i] == 0) { - output_dim_size = 1; - break; - } else { - output_dim_size *= shape[i]; - } - } - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr != nullptr) { - cuda::Impl_Cast(stream, reinterpret_cast(allocator->getBuffer()), reinterpret_cast(output_tensor_ptr), output_dim_size); - } - break; - } + CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, float) + CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16, uint16_t) + CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL, bool) + CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8, int8_t) + CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t) + CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t) + // The allocation buffer holds the int32 output data since TRT doesn't support int64. So, we need to cast the data (int32 -> int64) for ORT kernel output. + CASE_CAST_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int32_t, int64_t) + // The allocation buffer holds the float output data since TRT doesn't support double. So, we need to cast the data (float -> double) for ORT kernel output. + CASE_CAST_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, float, double) default: { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP output tensor data type: " + std::to_string(output_type) + " not supported."); } } - CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); return Status::OK(); } @@ -3513,7 +3366,6 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView output_tensors.reserve(num_outputs); std::unordered_map output_dim_sizes; output_dim_sizes.reserve(num_outputs); - std::unordered_set dds_output_set; for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) { char const* output_name = output_binding_names[i]; @@ -3531,7 +3383,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView } Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, i, output_tensors, output_dim_sizes, - dds_output_set, dds_output_allocator_map, scratch_buffers, alloc, buffers); + dds_output_allocator_map, scratch_buffers, alloc, buffers); if (status != Status::OK()) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); } @@ -3590,7 +3442,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView output_type = iter->second; } - if (dds_output_set.find(output_name) != dds_output_set.end()) { + if (dds_output_allocator_map.find(output_name) != dds_output_allocator_map.end()) { size_t output_index = 0; const auto& index_iter = output_indexes.find(output_name); if (index_iter != output_indexes.end()) { @@ -3806,7 +3658,6 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con output_tensors.reserve(num_outputs); std::unordered_map output_dim_sizes; output_dim_sizes.reserve(num_outputs); - std::unordered_set dds_output_set; for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) { char const* output_name = output_binding_names[i]; @@ -3824,7 +3675,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con } Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, i, output_tensors, output_dim_sizes, - dds_output_set, dds_output_allocator_map, scratch_buffers, alloc, buffers); + 
dds_output_allocator_map, scratch_buffers, alloc, buffers); if (status != Status::OK()) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); } @@ -3883,7 +3734,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con output_type = iter->second; } - if (dds_output_set.find(output_name) != dds_output_set.end()) { + if (dds_output_allocator_map.find(output_name) != dds_output_allocator_map.end()) { size_t output_index = 0; const auto& index_iter = output_indexes.find(output_name); if (index_iter != output_indexes.end()) { From d10256975527e8e041cedb19227cb5f207087c42 Mon Sep 17 00:00:00 2001 From: pengwa Date: Wed, 6 Mar 2024 10:06:25 +0800 Subject: [PATCH 112/279] Fix seed for recomputed Dropout (#19715) ### Fix seed for recomputed Dropout If a Dropout node is recomputed in the backward pass, we should make sure its execution is the same as the run in the forward pass. If we don't set the seed attribute, this cannot be guaranteed. Add `export ORTMODULE_MEMORY_OPT_LEVEL=2` to enable per-layer recompute with compromised recomputable subgraphs. --- docs/Memory_Optimizer.md | 1 + docs/ORTModule_Training_Guidelines.md | 5 ++- onnxruntime/core/common/string_utils.h | 12 +++++++ .../memory_optimizer/memory_insight.cc | 6 +++- .../memory_optimizer/memory_optimizer.cc | 34 +++++++++++++++++-- .../memory_optimizer/memory_optimizer.h | 1 + .../ortmodule/_graph_execution_manager.py | 7 +++- .../training/ortmodule/_runtime_inspector.py | 34 ++++++++++++++----- .../python/training/ortmodule/options.py | 18 ++++++++-- 9 files changed, 101 insertions(+), 17 deletions(-) diff --git a/docs/Memory_Optimizer.md b/docs/Memory_Optimizer.md index 97f7e7ff2c14b..eaa48c9da0609 100644 --- a/docs/Memory_Optimizer.md +++ b/docs/Memory_Optimizer.md @@ -51,6 +51,7 @@ There are two modes to enable the memory optimizations: - Plan 8 : OFF : Cast+:2:-1 1 2,048 2.0*inputs_input_ids_dim0*inputs_input_ids_dim1 ``` 3. As shown above, `Config` is a string representative for a re-computable subgraph. All are enabled for recompute in this case. +4. By `export ORTMODULE_MEMORY_OPT_LEVEL=2`, all plans including compromised recomptable subgraphs will also be enabled. ### Mode 2 - Advanced Usage (User Selected Subgraph Recompute) diff --git a/docs/ORTModule_Training_Guidelines.md b/docs/ORTModule_Training_Guidelines.md index 84631bd1f6555..54137937ad56d 100644 --- a/docs/ORTModule_Training_Guidelines.md +++ b/docs/ORTModule_Training_Guidelines.md @@ -287,7 +287,10 @@ A classical usage of disabling the deep copy: when the deep copy before module e #### ORTMODULE_MEMORY_OPT_LEVEL - **Feature Area**: *ORTMODULE/Optimizations* -- **Description**: By default, the level is 0. This env var can be used for enabling recomputation for reducing memory peak requirement. Setting the level to be 0 means all detected subgraphs with each transformer-based model layer generating stashed activations will be recomputed. This is conceptually equivalent to PyTorch's gradient checkpoint. When level is not 0, check Check [Memory Optimizer for ONNX Runtime Training](Memory_Optimizer.md) for more details. +- **Description**: By default, the level is 0. This env var can be used for enabling recomputation for reducing memory peak requirement. + - Setting the level to be 1 means all detected recomputable subgraphs (NOT including compromised recomputable graphs) with each transformer-based model layer generating stashed activations will be recomputed. This is conceptually equivalent to PyTorch's gradient checkpoint.
+ - Setting the level to be 2 means all detected recomputable subgraphs (including compromised recomputable graphs) with each transformer-based model layer generating stashed activations will be recomputed. This is conceptually equivalent to PyTorch's gradient checkpoint. + - When the level is 0, check Check [Memory Optimizer for ONNX Runtime Training](Memory_Optimizer.md) for more details. ```bash export ORTMODULE_MEMORY_OPT_LEVEL=0 diff --git a/onnxruntime/core/common/string_utils.h b/onnxruntime/core/common/string_utils.h index eca1221e84cb8..03e94cefd0564 100644 --- a/onnxruntime/core/common/string_utils.h +++ b/onnxruntime/core/common/string_utils.h @@ -65,5 +65,17 @@ inline std::string TrimString(std::string s) { return s; } +/** + * So use this simple hash to generate unique int by given string input. + */ +inline uint32_t GetHashFromString(const std::string& str_value) { + uint32_t hash = 0; + for (char const& c : str_value) { + hash = hash * 101 + c; + } + + return hash; +} + } // namespace utils } // namespace onnxruntime diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc index 3fbdd5da7b768..08c402bf669c8 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc @@ -9,6 +9,8 @@ #include #include +#include "core/common/string_utils.h" +#include "core/framework/random_seed.h" #include "core/graph/graph_utils.h" #include "core/graph/graph_viewer.h" #include "orttraining/core/optimizer/memory_optimizer/common.h" @@ -284,7 +286,9 @@ Status FindORTModuleMemoryOpportunity(const GraphViewer& graph_viewer, memory_opt_planner.AddNodeOptimizationPlan(p_node, std::move(recompute_plan)); } - if (can_compromise_stashed_activation) { + // Only detect compromise recompute when recompute is not found, in case there are multiple recompute plans + // for the same named activations, then user might enable those conflicting recompute plans by mistakes. + if (recompute_plan == nullptr && can_compromise_stashed_activation) { MO_LOG_DEBUG_INFO(logger, "Searching Node " + p_node->Name() + "(" + p_node->OpType() + ") for compromised recompute"); // If the subgraph recompute can save memory by comprising the assumption - recompute graphs' input must exist diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/memory_optimizer.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/memory_optimizer.cc index 49e026ca86bd3..525e3b4b8de35 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/memory_optimizer.cc +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/memory_optimizer.cc @@ -28,6 +28,29 @@ constexpr bool IsForwardPassOperator(ptrdiff_t op_order_in_topological_sort, return op_order_in_topological_sort <= boundary_op_order_in_topological_sort; } +// Reset seed attribute for the dropout node if the seed is not set. +bool SetSeedForDropoutNode(Node& node) { + // ONNX Dropout 1, 6, 7, 10 do not have seed attribute, so we remove them from the recompute support. + // TODO(pengwa): add the opset check in GetAllowedRecomputeOps. 
+ if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "Dropout", {12, 13}, kOnnxDomain) || + graph_utils::IsSupportedOptypeVersionAndDomain(node, "BitmaskDropout", {1}, kMSDomain) || + graph_utils::IsSupportedOptypeVersionAndDomain(node, "BiasDropout", {1}, kMSDomain) || + graph_utils::IsSupportedOptypeVersionAndDomain(node, "BitmaskBiasDropout", {1}, kMSDomain) || + graph_utils::IsSupportedOptypeVersionAndDomain(node, "BiasSoftmaxDropout", {1}, kMSDomain)) { + auto& attrs = node.GetAttributes(); + if (attrs.count("seed")) { + return false; + } + + int64_t seed = static_cast(utils::GetHashFromString(node.OutputDefs()[0]->Name())) + + utils::GetRandomSeed(); + node.AddAttribute("seed", seed); + return true; + } + + return false; +} + } // namespace Status MemoryOptimizer::ParseOptimizationConfigFromString(const std::string& memory_optimizer_config, @@ -74,7 +97,7 @@ bool MemoryOptimizer::ModifyGraph(Graph& graph, optimizer::memory_optimizer::NodeRecomputePlan* recompute_plan = dynamic_cast(node_plan.get()); ORT_ENFORCE(recompute_plan != nullptr); - ORT_ENFORCE(CreateRecomputeGraph(graph, recompute_plan->GetNodesInTopoOrder(), replacement_node_ptr).IsOK()); + ORT_ENFORCE(CreateRecomputeGraph(graph, recompute_plan->GetNodesInTopoOrder(), logger, replacement_node_ptr).IsOK()); } else { ORT_THROW("unsupported optimization type found."); } @@ -93,7 +116,7 @@ bool MemoryOptimizer::ModifyGraph(Graph& graph, auto tid = node_index_to_its_order_in_topological_sort_map.find(it->GetNode().Index()); // It is possible the consumer node is newly added as the recompute node, so we need a check here. - // For those kind of ops, we can treat them as backward ops. + // For those kinds of ops, we can treat them as backward ops. if (tid == node_index_to_its_order_in_topological_sort_map.end() || !IsForwardPassOperator(node_index_to_its_order_in_topological_sort_map.at(tid->first), boundary_op_order_in_topological_sort)) { @@ -223,6 +246,7 @@ void MemoryOptimizer::PrintSummary(const optimizer::memory_optimizer::MemoryOpti Status MemoryOptimizer::CreateRecomputeGraph(Graph& graph, const InlinedVector& nodes_in_topological_order, + const logging::Logger& logger, Node*& new_output_node_ptr) const { InlinedHashMap self_contained_outputs_map; for (size_t i = 0; i < nodes_in_topological_order.size(); ++i) { @@ -236,6 +260,12 @@ Status MemoryOptimizer::CreateRecomputeGraph(Graph& graph, continue; } + bool seed_reset = SetSeedForDropoutNode(*node_to_duplicate); + if (seed_reset) { + LOGS(logger, VERBOSE) << "Set seed for Node " << node_to_duplicate->Name() << "(" << node_to_duplicate->OpType() + << ")."; + } + InlinedVector new_input_args; new_input_args.reserve(node_to_duplicate->MutableInputDefs().size()); for (NodeArg* input_arg : node_to_duplicate->MutableInputDefs()) { diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/memory_optimizer.h b/orttraining/orttraining/core/optimizer/memory_optimizer/memory_optimizer.h index b3e05fd334e48..1d837038e76c1 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/memory_optimizer.h +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/memory_optimizer.h @@ -94,6 +94,7 @@ class MemoryOptimizer : public GraphTransformer { */ Status CreateRecomputeGraph(Graph& graph, const InlinedVector& nodes_in_topological_order, + const logging::Logger& logger, Node*& recompute_subgraph_output_node) const; /************************************************** diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py 
b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py index e189ffff9cc7f..c67b05758c5aa 100755 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py @@ -754,6 +754,11 @@ def _add_record(tbl, columns): if self._runtime_options.memory_optimization_level == _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE: opt_config_to_display = "ALL_RECOMPUTE_FOR_EACH_LAYER" + elif ( + self._runtime_options.memory_optimization_level + == _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE_WITH_COMPROMISE + ): + opt_config_to_display = "ALL_RECOMPUTE_FOR_EACH_LAYER_WITH_COMPROMISE" else: opt_config_to_display = self._runtime_options.memory_optimizer_config @@ -766,7 +771,7 @@ def _add_record(tbl, columns): f"Memory Optimization Level: [{_MemoryOptimizationLevel.to_string(self._runtime_options.memory_optimization_level)}], " f"Optimization Config: [{opt_config_to_display}]" if len(self._runtime_options.memory_optimizer_config) > 0 - else "Enable with env ORTMODULE_MEMORY_OPT_LEVEL=1 or ORTMODULE_MEMORY_OPT_CONFIG=,,..." + else "Enable with env ORTMODULE_MEMORY_OPT_LEVEL=1/2 or ORTMODULE_MEMORY_OPT_CONFIG=,,..." ), ], ) diff --git a/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py b/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py index 772b9bd9e31ae..22e31466887a6 100644 --- a/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py +++ b/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py @@ -545,7 +545,10 @@ def find_memory_optimization_opportunity(self, execution_agent: TrainingAgent, r # If the memory optimization level is aggressive, we will first collect all # recompute subgraph by passing empty memory_optimizer_config to get_serialized_ortmodule_memory_stat. - if runtime_options.memory_optimization_level == _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE: + if runtime_options.memory_optimization_level in [ + _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE, + _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE_WITH_COMPROMISE, + ]: memory_optimizer_config = "" ( @@ -581,16 +584,27 @@ def find_memory_optimization_opportunity(self, execution_agent: TrainingAgent, r self.cluster_id_combination_to_saving_symbolics_map[cluster_id] = values # For aggressive memory optimization, we update the memory_optimizer_config using all. - if runtime_options.memory_optimization_level == _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE: + if runtime_options.memory_optimization_level > 0: recompute_configs = [] for cluster_id in self.cluster_id_combination_to_saving_symbolics_map: config_values = cluster_id.split(":") opt_type = int(config_values[1]) - # TODO(pengwa): use enum instead of 1 here. 
- if opt_type != 1: - continue - - recompute_configs.append(cluster_id) + if ( + runtime_options.memory_optimization_level + == _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE + and opt_type == _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE + ): + recompute_configs.append(cluster_id) + elif ( + runtime_options.memory_optimization_level + == _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE_WITH_COMPROMISE + and opt_type + in [ + _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE, + _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE_WITH_COMPROMISE, + ] + ): + recompute_configs.append(cluster_id) runtime_options.memory_optimizer_config = ",".join(recompute_configs) @@ -699,14 +713,16 @@ def _get_user_config_without_freq(configs: str): notes = [] if details: notes.append( - "[Memory Optimizer] Use ORTMODULE_MEMORY_OPT_LEVEL=1 to enable all recomputable subgraphs per transformer layer." + "[Memory Optimizer] Use ORTMODULE_MEMORY_OPT_LEVEL=1/2 to enable all recomputable subgraphs per transformer layer." ) saving_recommendation = "[Memory Optimizer] Or use comma as a delimiter to selectively enable multiple memory optimization plans:\n" saving_recommendation += " export ORTMODULE_MEMORY_OPT_CONFIG=,,..." notes.append(saving_recommendation) - saving_recommendation = "memory saving is calculated based on the 1st batch symbolic dim values:\n" + saving_recommendation = ( + "[Memory Optimizer] memory saving is calculated based on the 1st batch symbolic dim values:\n" + ) for dim_param, dim_value in self.symbolic_dim_name_to_value_map.items(): saving_recommendation += f" {dim_param}={dim_value}," notes.append(saving_recommendation) diff --git a/orttraining/orttraining/python/training/ortmodule/options.py b/orttraining/orttraining/python/training/ortmodule/options.py index 93d24a34df6bd..7263a5719e262 100644 --- a/orttraining/orttraining/python/training/ortmodule/options.py +++ b/orttraining/orttraining/python/training/ortmodule/options.py @@ -196,7 +196,10 @@ class _MemoryOptimizationLevel(IntFlag): """Enumeration to specify memory optimization level""" USER_SPECIFIED = 0 # Fully respect user-specified config - TRANSFORMER_LAYERWISE_RECOMPUTE = 1 # Enable all recomputable subgraphs per layer + TRANSFORMER_LAYERWISE_RECOMPUTE = ( + 1 # Enable all recomputable subgraphs (excluding compromised recomptable graphs) per layer + ) + TRANSFORMER_LAYERWISE_RECOMPUTE_WITH_COMPROMISE = 2 # Enable all recomputable subgraphs per layer @staticmethod def to_string(memory_optimization_level): @@ -206,6 +209,9 @@ def to_string(memory_optimization_level): if memory_optimization_level == _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE: return "TRANSFORMER_LAYERWISE_RECOMPUTE" + if memory_optimization_level == _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE_WITH_COMPROMISE: + return "TRANSFORMER_LAYERWISE_RECOMPUTE_WITH_COMPROMISE" + return "" @@ -344,7 +350,10 @@ def _override_from_env_vars(self): self.memory_optimization_level = int(os.getenv("ORTMODULE_MEMORY_OPT_LEVEL", self.memory_optimization_level)) user_given_memory_optimizer_config = os.getenv("ORTMODULE_MEMORY_OPT_CONFIG", self.memory_optimizer_config) self.memory_optimizer_config = ",".join([c for c in user_given_memory_optimizer_config.split(",") if c]) - if self.memory_optimization_level == _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE: + if self.memory_optimization_level in [ + _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE, + 
_MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE_WITH_COMPROMISE, + ]: # For transformer layer-wise recompute, we enable layer boundary when detecting subgraphs. # Then all detected subgraphs will not cross different layers. self.recompute_probe_config = "1:1" @@ -419,7 +428,10 @@ def memory_optimizer_is_enabled(self) -> bool: """Check whether memory optimizer is enabled.""" if self.memory_optimization_level == _MemoryOptimizationLevel.USER_SPECIFIED: return len(self.memory_optimizer_config) > 0 - elif self.memory_optimization_level == _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE: + elif self.memory_optimization_level in [ + _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE, + _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE_WITH_COMPROMISE, + ]: return True return False From 1bfc26685b51522395e136a606005a72997e6bff Mon Sep 17 00:00:00 2001 From: Vincent Wang Date: Wed, 6 Mar 2024 10:11:46 +0800 Subject: [PATCH 113/279] ATen Op Supports Int Return Type and CPU Tensor Arguments (#19773) This PR: - add support for int as return type, will create a CPU scalar tensor for it. - add attributes to specify which arguments or returns are CPU tensors. - adjust ATen efficient attn to match latest PyTorch native function. - a Triton codegen bugfix by the way. --- .../cpu/aten_ops/aten_op_executor.h | 16 +- onnxruntime/core/framework/utils.cc | 24 ++- .../core/graph/contrib_ops/contrib_defs.cc | 2 + .../python/onnxruntime_pybind_state.cc | 10 +- .../aten_op_executor/__init__.py | 2 +- .../aten_op_executor/aten_op_executor.cc | 62 ++++--- .../ort_torch_ext/__init__.py | 4 +- .../python/training/ort_triton/_ir.py | 3 + .../ortmodule/graph_optimizers/__init__.py | 2 +- .../ortmodule/graph_optimizers/_aten_attn.py | 169 +++--------------- 10 files changed, 96 insertions(+), 198 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/aten_ops/aten_op_executor.h b/onnxruntime/contrib_ops/cpu/aten_ops/aten_op_executor.h index d72868cd8fa9f..56c8e2911e280 100644 --- a/onnxruntime/contrib_ops/cpu/aten_ops/aten_op_executor.h +++ b/onnxruntime/contrib_ops/cpu/aten_ops/aten_op_executor.h @@ -10,7 +10,7 @@ namespace onnxruntime { namespace contrib { namespace aten_ops { -typedef bool (*IsCpuArgumentFunc)(const char* op_name, const char* overload_name, size_t index, bool is_input); +typedef bool (*IsTensorArgumentFunc)(const char* op_name, const char* overload_name, size_t index, bool is_input); typedef void (*ExecuteATenOperatorFunc)(const char* op_name, const char* overload_name, size_t input_size, DLManagedTensor** dlpack_inputs, size_t output_size, DLManagedTensor** dlpack_outputs); @@ -22,17 +22,17 @@ class ATenOperatorExecutor { return instance; } - void Initialize(void* p_is_cpu_argument_func_raw, void* p_execute_aten_op_func_raw) { - ORT_ENFORCE(p_is_cpu_argument_func_raw && p_execute_aten_op_func_raw); - p_is_cpu_argument_func_ = reinterpret_cast(p_is_cpu_argument_func_raw); + void Initialize(void* p_is_tensor_argument_func_raw, void* p_execute_aten_op_func_raw) { + ORT_ENFORCE(p_is_tensor_argument_func_raw && p_execute_aten_op_func_raw); + p_is_tensor_argument_func_ = reinterpret_cast(p_is_tensor_argument_func_raw); p_execute_aten_op_func_ = reinterpret_cast(p_execute_aten_op_func_raw); } bool IsInitialized() { return p_execute_aten_op_func_ != nullptr; } - bool IsCpuArgument(const std::string& op_name, const std::string& overload_name, size_t index, bool is_input) { - ORT_ENFORCE(p_is_cpu_argument_func_, "ATenOperatorExecutor is not initialized."); - return 
p_is_cpu_argument_func_(op_name.c_str(), overload_name.c_str(), index, is_input); + bool IsTensorArgument(const std::string& op_name, const std::string& overload_name, size_t index, bool is_input) { + ORT_ENFORCE(p_is_tensor_argument_func_, "ATenOperatorExecutor is not initialized."); + return p_is_tensor_argument_func_(op_name.c_str(), overload_name.c_str(), index, is_input); } void operator()(const std::string& op_name, const std::string& overload_name, size_t input_size, @@ -43,7 +43,7 @@ class ATenOperatorExecutor { } private: - IsCpuArgumentFunc p_is_cpu_argument_func_ = nullptr; + IsTensorArgumentFunc p_is_tensor_argument_func_ = nullptr; ExecuteATenOperatorFunc p_execute_aten_op_func_ = nullptr; }; diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc index 23fe5e1cd3d96..b737d735b977b 100644 --- a/onnxruntime/core/framework/utils.cc +++ b/onnxruntime/core/framework/utils.cc @@ -1015,9 +1015,19 @@ bool IsInputOnCpu(const Node& node, const KernelCreateInfo* p_kci, size_t index) } #ifdef ENABLE_ATEN + // For ATen node, we assume that all tensor inputs are on device, all non-tensor inputs are on CPU, + // except those specified in attribute cpu_input_args; if (node.GetExecutionProviderType() == kCudaExecutionProvider && node.OpType() == "ATen" && node.Domain() == kPytorchAtenDomain) { const auto& attrs = node.GetAttributes(); + if (auto entry = attrs.find("cpu_input_args"); entry != attrs.end()) { + const auto& attr = entry->second; + if (utils::HasInts(attr) && std::any_of(attr.ints().cbegin(), attr.ints().cend(), + [index](int64_t arg) { return static_cast(index) == arg; })) { + return true; + } + } + ORT_ENFORCE(utils::HasString(attrs.at("operator"))); std::string op_name = attrs.at("operator").s(); std::string overload_name = ""; @@ -1025,7 +1035,7 @@ bool IsInputOnCpu(const Node& node, const KernelCreateInfo* p_kci, size_t index) overload_name = attrs.at("overload_name").s(); } - return contrib::aten_ops::ATenOperatorExecutor::Instance().IsCpuArgument(op_name, overload_name, index, true); + return !contrib::aten_ops::ATenOperatorExecutor::Instance().IsTensorArgument(op_name, overload_name, index, true); } #else ORT_UNUSED_PARAMETER(node); @@ -1040,9 +1050,19 @@ bool IsOutputOnCpu(const Node& node, const KernelCreateInfo* p_kci, size_t index } #ifdef ENABLE_ATEN + // For ATen node, we assume that all tensor outputs are on device, all non-tensor outputs are on CPU, + // except those specified in attribute cpu_output_args; if (node.GetExecutionProviderType() == kCudaExecutionProvider && node.OpType() == "ATen" && node.Domain() == kPytorchAtenDomain) { const auto& attrs = node.GetAttributes(); + if (auto entry = attrs.find("cpu_output_args"); entry != attrs.end()) { + const auto& attr = entry->second; + if (utils::HasInts(attr) && std::any_of(attr.ints().cbegin(), attr.ints().cend(), + [index](int64_t arg) { return static_cast(index) == arg; })) { + return true; + } + } + ORT_ENFORCE(utils::HasString(attrs.at("operator"))); std::string op_name = attrs.at("operator").s(); std::string overload_name = ""; @@ -1050,7 +1070,7 @@ bool IsOutputOnCpu(const Node& node, const KernelCreateInfo* p_kci, size_t index overload_name = attrs.at("overload_name").s(); } - return contrib::aten_ops::ATenOperatorExecutor::Instance().IsCpuArgument(op_name, overload_name, index, false); + return !contrib::aten_ops::ATenOperatorExecutor::Instance().IsTensorArgument(op_name, overload_name, index, false); } #else ORT_UNUSED_PARAMETER(node); diff --git 
a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index f06a3785f362d..6709398c788f0 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -3474,6 +3474,8 @@ MatMulBnb4 is a MatMul with weight quantized with 4 bits using either FP4 or NF4 /*min_arity*/ 1) .Attr("operator", "Name of ATen operator.", AttributeProto::STRING) .Attr("overload_name", "Overload name of ATen operator.", AttributeProto::STRING, false) + .Attr("cpu_input_args", "CPU input argument indices.", AttributeProto::INTS, false) + .Attr("cpu_output_args", "CPU output argument indices.", AttributeProto::INTS, false) .TypeConstraint("T", OpSchema::all_tensor_types_ir4(), "Allow inputs and outputs to be any kind of tensor."); #endif diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 9c36eb635ffcf..e5e0e81cb7da8 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -1327,14 +1327,14 @@ void addGlobalMethods(py::module& m) { #ifdef ENABLE_ATEN m.def("register_aten_op_executor", - [](const std::string& is_cpu_argument_address_str, const std::string& aten_op_executor_address_str) -> void { - size_t is_cpu_argument_address_int, aten_op_executor_address_int; + [](const std::string& is_tensor_argument_address_str, const std::string& aten_op_executor_address_str) -> void { + size_t is_tensor_argument_address_int, aten_op_executor_address_int; ORT_THROW_IF_ERROR( - ParseStringWithClassicLocale(is_cpu_argument_address_str, is_cpu_argument_address_int)); + ParseStringWithClassicLocale(is_tensor_argument_address_str, is_tensor_argument_address_int)); ORT_THROW_IF_ERROR(ParseStringWithClassicLocale(aten_op_executor_address_str, aten_op_executor_address_int)); - void* p_is_cpu_argument = reinterpret_cast(is_cpu_argument_address_int); + void* p_is_tensor_argument = reinterpret_cast(is_tensor_argument_address_int); void* p_aten_op_executor = reinterpret_cast(aten_op_executor_address_int); - contrib::aten_ops::ATenOperatorExecutor::Instance().Initialize(p_is_cpu_argument, p_aten_op_executor); + contrib::aten_ops::ATenOperatorExecutor::Instance().Initialize(p_is_tensor_argument, p_aten_op_executor); }); #endif } diff --git a/onnxruntime/python/torch_cpp_extensions/aten_op_executor/__init__.py b/onnxruntime/python/torch_cpp_extensions/aten_op_executor/__init__.py index 8bf7cbf80eb37..9dee6564509d5 100644 --- a/onnxruntime/python/torch_cpp_extensions/aten_op_executor/__init__.py +++ b/onnxruntime/python/torch_cpp_extensions/aten_op_executor/__init__.py @@ -29,5 +29,5 @@ def load_aten_op_executor_cpp_extension(): from onnxruntime.training.ortmodule.torch_cpp_extensions import aten_op_executor _C.register_aten_op_executor( - str(aten_op_executor.is_cpu_argument_address()), str(aten_op_executor.execute_aten_operator_address()) + str(aten_op_executor.is_tensor_argument_address()), str(aten_op_executor.execute_aten_operator_address()) ) diff --git a/onnxruntime/python/torch_cpp_extensions/aten_op_executor/aten_op_executor.cc b/onnxruntime/python/torch_cpp_extensions/aten_op_executor/aten_op_executor.cc index 903a394a06ef3..e8be98cbfc0e4 100644 --- a/onnxruntime/python/torch_cpp_extensions/aten_op_executor/aten_op_executor.cc +++ b/onnxruntime/python/torch_cpp_extensions/aten_op_executor/aten_op_executor.cc @@ -34,18 +34,23 @@ struct ATenOperator { std::vector is_optional_arguments; std::vector> 
default_values; size_t return_size; + std::vector ret_kinds; c10::IValue ToIValueArgument(const DLManagedTensor* dlpack, size_t index) const { TORCH_INTERNAL_ASSERT(index < argument_size); bool is_optional = is_optional_arguments[index]; - TORCH_INTERNAL_ASSERT(dlpack || is_optional || default_values[index]); + TORCH_INTERNAL_ASSERT(dlpack || is_optional || default_values[index] || + elem_kinds[index] == c10::TypeKind::TensorType); if (!dlpack) { if (is_optional) { // Optional argument always has no default value. return c10::IValue(c10::nullopt); } - - return *default_values[index]; + if (default_values[index]) { + return *default_values[index]; + } + // Fow bw func, it's possible that input is an undefined tensor from fw outputs, dlpack is nullptr for such case. + return c10::IValue(at::Tensor()); } bool is_list = is_list_arguments[index]; @@ -142,7 +147,10 @@ class ATenOperatorCache { } aten_op.return_size = schema.returns().size(); for (const auto& ret : schema.returns()) { - TORCH_INTERNAL_ASSERT(ret.type()->kind() == c10::TypeKind::TensorType); + c10::TypeKind ret_type = ret.type()->kind(); + // Support tensor or int only for now. + TORCH_INTERNAL_ASSERT(ret_type == c10::TypeKind::TensorType || ret_type == c10::TypeKind::IntType); + aten_op.ret_kinds.emplace_back(ret_type); } ops_.emplace(key, aten_op); } @@ -154,32 +162,15 @@ class ATenOperatorCache { std::unordered_map, ATenOperator, PairHash> ops_; }; -const std::unordered_map> kCpuTensorInputsMap = { - {"_efficient_attention_forward", {4, 5, 11, 12}}, {"_efficient_attention_backward", {6, 7, 12, 13}}}; - -const std::unordered_map> kCpuTensorOutputsMap = { - {"_efficient_attention_forward", {2, 3}}}; - -// Backend uses this function to check if an argument is CPU input or not. -bool IsCpuArgument(const char* op_name, const char* overload_name, size_t index, bool is_input) { +// Backend uses this function to check if an argument is tensor type or not. +bool IsTensorArgument(const char* op_name, const char* overload_name, size_t index, bool is_input) { + const auto& aten_op = ATenOperatorCache::Instance().GetOperator(op_name, overload_name); if (is_input) { - // If the argument is non-tensor type, it's CPU argument. - const auto& aten_op = ATenOperatorCache::Instance().GetOperator(op_name, overload_name); TORCH_INTERNAL_ASSERT(index < aten_op.argument_size); - if (aten_op.elem_kinds[index] != c10::TypeKind::TensorType) { - return true; - } - } - - std::string full_name = std::string(op_name); - std::string overload_name_str = std::string(overload_name); - if (overload_name_str != "") { - full_name += ("." + overload_name_str); + return aten_op.elem_kinds[index] == c10::TypeKind::TensorType; } - - const auto& cpu_tensors_map = is_input ? kCpuTensorInputsMap : kCpuTensorOutputsMap; - return cpu_tensors_map.find(full_name) != cpu_tensors_map.end() && - cpu_tensors_map.at(full_name).find(index) != cpu_tensors_map.at(full_name).end(); + TORCH_INTERNAL_ASSERT(index < aten_op.return_size); + return aten_op.ret_kinds[index] == c10::TypeKind::TensorType; } void ExecuteATenOperator(const char* op_name, const char* overload_name, size_t input_size, @@ -216,16 +207,23 @@ void ExecuteATenOperator(const char* op_name, const char* overload_name, size_t TORCH_INTERNAL_ASSERT(output_size == aten_op.return_size); size_t output_index = 0; for (const auto& ret : torch::jit::pop(stack, output_size)) { - const auto& tensor = ret.toTensor(); - dlpack_outputs[output_index++] = - tensor.defined() ? at::toDLPack(tensor.is_contiguous() ? 
tensor : tensor.contiguous()) : nullptr; + if (ret.isTensor()) { + const auto& tensor = ret.toTensor(); + dlpack_outputs[output_index++] = + tensor.defined() ? at::toDLPack(tensor.is_contiguous() ? tensor : tensor.contiguous()) : nullptr; + } else if (ret.isInt()) { + at::Tensor scalar = at::scalar_to_tensor(at::Scalar(ret.toInt())); + dlpack_outputs[output_index++] = at::toDLPack(scalar); + } else { + TORCH_INTERNAL_ASSERT(false); + } } } -size_t is_cpu_argument_address() { return reinterpret_cast(&IsCpuArgument); } +size_t is_tensor_argument_address() { return reinterpret_cast(&IsTensorArgument); } size_t execute_aten_operator_address() { return reinterpret_cast(&ExecuteATenOperator); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("is_cpu_argument_address", &is_cpu_argument_address, "Address of tensor argument check."); + m.def("is_tensor_argument_address", &is_tensor_argument_address, "Address of tensor argument check."); m.def("execute_aten_operator_address", &execute_aten_operator_address, "Address of Aten operator executor"); } diff --git a/onnxruntime/python/torch_cpp_extensions/ort_torch_ext/__init__.py b/onnxruntime/python/torch_cpp_extensions/ort_torch_ext/__init__.py index 329fba5aa670a..7d5716b85db30 100644 --- a/onnxruntime/python/torch_cpp_extensions/ort_torch_ext/__init__.py +++ b/onnxruntime/python/torch_cpp_extensions/ort_torch_ext/__init__.py @@ -5,7 +5,7 @@ from onnxruntime.capi import _pybind_state as _C -from .aten_op_executor import execute_aten_operator_address, is_cpu_argument_address +from .aten_op_executor import execute_aten_operator_address, is_tensor_argument_address def run_once_aten_op_executor(f): @@ -30,7 +30,7 @@ def aten_op_executor_wrapper(*args, **kwargs): @run_once_aten_op_executor def load_aten_op_executor_cpp_extension(): - _C.register_aten_op_executor(str(is_cpu_argument_address()), str(execute_aten_operator_address())) + _C.register_aten_op_executor(str(is_tensor_argument_address()), str(execute_aten_operator_address())) def init_aten_op_executor(): diff --git a/orttraining/orttraining/python/training/ort_triton/_ir.py b/orttraining/orttraining/python/training/ort_triton/_ir.py index a2b8407645c46..a963d30a9e6e7 100644 --- a/orttraining/orttraining/python/training/ort_triton/_ir.py +++ b/orttraining/orttraining/python/training/ort_triton/_ir.py @@ -392,5 +392,8 @@ def __init__( for ir_node in kernel.sub_nodes: if isinstance(ir_node, DropoutNode): ir_node.global_offset = running_offset + kernel.offset_calc.symbolic_shape_variables.update( + [symbol.name for symbol in running_offset.free_symbols] + ) running_offset = running_offset + sympy.prod(ir_node.outputs[0].shape) self.has_dropout = True diff --git a/orttraining/orttraining/python/training/ortmodule/graph_optimizers/__init__.py b/orttraining/orttraining/python/training/ortmodule/graph_optimizers/__init__.py index 3d3538a62da61..368d1b238fd9e 100644 --- a/orttraining/orttraining/python/training/ortmodule/graph_optimizers/__init__.py +++ b/orttraining/orttraining/python/training/ortmodule/graph_optimizers/__init__.py @@ -13,7 +13,7 @@ if ( "ORTMODULE_USE_EFFICIENT_ATTENTION" in os.environ and int(os.getenv("ORTMODULE_USE_EFFICIENT_ATTENTION")) == 1 - and Version(torch.__version__) >= Version("2.1.1") + and Version(torch.__version__) >= Version("2.3.0") ): from ._aten_attn import optimize_graph_for_aten_efficient_attention # noqa: F401 diff --git a/orttraining/orttraining/python/training/ortmodule/graph_optimizers/_aten_attn.py 
b/orttraining/orttraining/python/training/ortmodule/graph_optimizers/_aten_attn.py index b1e8809f03fc0..c1fb6e68568f5 100644 --- a/orttraining/orttraining/python/training/ortmodule/graph_optimizers/_aten_attn.py +++ b/orttraining/orttraining/python/training/ortmodule/graph_optimizers/_aten_attn.py @@ -5,9 +5,12 @@ """ PyTorch's _efficient_attention_forward/_efficient_attention_backward APIs is keep changing. Current implementation -is tested well on version 2.2.0.dev20231010+cu121, and should be run well since official version 2.2.0. If may fail to +is tested well on version 2.3.0.dev20240221+cu118, and should be run well since official version 2.3.0. If may fail to run is you are using PyTorch with older versions. +This file is more like an example of how to add a new graph optimizer. Ideally user can add graph optimizer according +to the specific model they are using on their own instead of putting every possible graph optimizer here. + PyTorch also has API for flash attention (currently doesn't support random attention mask or Dropout), we can add support if we want to try in the future. """ @@ -40,13 +43,14 @@ def _make_efficient_attention_nodes( scale_node = make_constant_node("scale_" + str(idx), TensorProto.FLOAT, [], [scale]) dropout_ratio_node = make_constant_node("dropout_ratio_" + str(idx), TensorProto.FLOAT, [], [dropout_ratio]) causal_node = make_constant_node("causal_" + str(idx), TensorProto.INT64, [], [1 if causal else 0]) - int_zero_node = make_constant_node("int_zero_" + str(idx), TensorProto.INT64, [], [0]) - true_node = make_constant_node("true_" + str(idx), TensorProto.BOOL, [], [True]) - false_node = make_constant_node("false_" + str(idx), TensorProto.BOOL, [], [False]) + one_node = make_constant_node("one_" + str(idx), TensorProto.INT64, [], [1]) + zero_node = make_constant_node("zero_" + str(idx), TensorProto.INT64, [], [0]) logsumexp = helper.make_tensor_value_info("logsumexp" + str(idx), TensorProto.FLOAT, []) seed = helper.make_tensor_value_info("seed" + str(idx), TensorProto.INT64, []) offset = helper.make_tensor_value_info("offset" + str(idx), TensorProto.INT64, []) - new_value_infos = [logsumexp, seed, offset] + msb_q = helper.make_tensor_value_info("msb_q_" + str(idx), TensorProto.INT64, []) + msb_k = helper.make_tensor_value_info("msb_k_" + str(idx), TensorProto.INT64, []) + new_value_infos = [logsumexp, seed, offset, msb_q, msb_k] if expand_bias: shape_0 = helper.make_node("Shape", [q], ["shape_0_" + str(idx)], start=0, end=1) shape_1 = helper.make_node("Shape", [q], ["shape_1_" + str(idx)], start=2, end=3) @@ -54,13 +58,13 @@ def _make_efficient_attention_nodes( shape_3 = helper.make_node("Shape", [k], ["shape_3_" + str(idx)], start=1, end=2) concat = helper.make_node( "Concat", - ["shape_0_" + str(idx), "shape_1_" + str(idx), "shape_2_" + str(idx), "shape_3_" + str(idx)], + [shape_0.output[0], shape_1.output[0], shape_2.output[0], shape_3.output[0]], ["concated_shape_" + str(idx)], axis=0, ) - expand = helper.make_node("Expand", [bias, "concated_shape_" + str(idx)], ["expanded_bias_" + str(idx)]) + expand = helper.make_node("Expand", [bias, concat.output[0]], ["expanded_bias_" + str(idx)]) nodes_to_add.extend([shape_0, shape_1, shape_2, shape_3, concat, expand]) - bias = "expanded_bias_" + str(idx) + bias = expand.output[0] fwd_node = helper.make_node( "ATen", [ @@ -71,18 +75,21 @@ def _make_efficient_attention_nodes( "", "", "", + "", dropout_ratio_node.output[0], causal_node.output[0], - true_node.output[0], + one_node.output[0], scale_node.output[0], 
"", "", ], - [y, logsumexp.name, seed.name, offset.name], + [y, logsumexp.name, seed.name, offset.name, msb_q.name, msb_k.name], "efficient_attention_forward_" + str(idx), None, "org.pytorch.aten", operator="_efficient_attention_forward", + cpu_input_args=[4, 5, 12, 13], + cpu_output_args=[2, 3, 4, 5], ) bwd_node = helper.make_node( "ATen", @@ -95,14 +102,14 @@ def _make_efficient_attention_nodes( y, "", "", - int_zero_node.output[0], - int_zero_node.output[0], + msb_q.name, + msb_k.name, logsumexp.name, dropout_ratio_node.output[0], seed.name, offset.name, causal_node.output[0], - false_node.output[0], + zero_node.output[0], scale_node.output[0], "", ], @@ -111,10 +118,9 @@ def _make_efficient_attention_nodes( None, "org.pytorch.aten", operator="_efficient_attention_backward", + cpu_input_args=[6, 7, 12, 13], ) - nodes_to_add.extend( - [scale_node, dropout_ratio_node, causal_node, int_zero_node, true_node, false_node, fwd_node, bwd_node] - ) + nodes_to_add.extend([scale_node, dropout_ratio_node, causal_node, one_node, zero_node, fwd_node, bwd_node]) return nodes_to_add, new_value_infos @@ -240,140 +246,9 @@ def _optimize_for_pattern_1(matcher: GraphMatcher, idx: int, nodes: List[NodePro return nodes, nodes_to_add, new_value_infos -# No causal mask, no attention mask, without Dropout. -_PATTERN_2: List[Tuple[str, bool, List[Tuple[int, int, int]]]] = [ - ("MatMul", False, []), # 0 - ("Mul", True, [(0, 0, 0)]), # 1 - ("Mul", True, [(0, 0, 1)]), # 2 - ("Transpose", True, [(1, 0, 0)]), # 3 - ("Transpose", True, [(2, 0, 0)]), # 4 - ("Softmax", False, [(0, 0, 0)]), # 5 - ("MatMul", False, [(5, 0, 0)]), # 6 - ("Transpose", True, [(6, 0, 1)]), # 7 - ("Transpose", False, [(6, 0, 0)]), # 8 - ("FusedMatMul", False, [(7, 0, 1)]), # 9 - ("SoftmaxGrad_13", False, [(9, 0, 0), (5, 0, 1)]), # 10 - ("FusedMatMul", False, [(2, 0, 1), (10, 0, 0)]), # 11 - ("FusedMatMul", False, [(1, 0, 0), (10, 0, 1)]), # 12 - ("Mul", False, [(11, 0, 0)]), # 13 - ("Mul", False, [(12, 0, 0)]), # 14 - ("Identity", False, [(13, 0, 0)]), # 15 - ("Identity", False, [(14, 0, 0)]), # 16 - ("Transpose", False, [(15, 0, 0)]), # 17 - ("Transpose", False, [(16, 0, 0)]), # 18 - ("FusedMatMul", False, [(5, 0, 0)]), # 19 - ("Transpose", True, [(19, 0, 1)]), # 20 - ("Transpose", False, [(19, 0, 0)]), # 21 -] - - -def _optimize_for_pattern_2(matcher: GraphMatcher, idx: int, nodes: List[NodeProto]): - # Check forward only as the backward is expected to be consistent if it's built correctly. - scale_value_1 = matcher.get_constant_value(nodes[1].input[1]) - scale_value_1 = scale_value_1[0] if isinstance(scale_value_1, list) else scale_value_1 - scale_value_2 = matcher.get_constant_value(nodes[2].input[1]) - scale_value_2 = scale_value_2[0] if isinstance(scale_value_2, list) else scale_value_2 - if not ( - check_attribute_value(nodes[3], "perm", [0, 2, 1, 3]) - and check_attribute_value(nodes[4], "perm", [0, 2, 3, 1]) - and check_attribute_value(nodes[7], "perm", [0, 2, 1, 3]) - and check_attribute_value(nodes[8], "perm", [0, 2, 1, 3]) - and scale_value_1 == scale_value_2 - ): - return [], [], [] - - nodes_to_add, new_value_infos = _make_efficient_attention_nodes( - idx, - nodes[3].input[0], - nodes[4].input[0], - nodes[7].input[0], - nodes[8].output[0], - nodes[20].input[0], - nodes[17].output[0], - nodes[18].output[0], - nodes[21].output[0], - "", - False, - scale_value_1, - 0.0, - False, - ) - return nodes, nodes_to_add, new_value_infos - - -# Has causal mask, no attention mask, without Dropout. 
-_PATTERN_3: List[Tuple[str, bool, List[Tuple[int, int, int]]]] = [ - ("MatMul", False, []), # 0 - ("Mul", True, [(0, 0, 0)]), # 1 - ("Mul", True, [(0, 0, 1)]), # 2 - ("Transpose", True, [(1, 0, 0)]), # 3 - ("Transpose", True, [(2, 0, 0)]), # 4 - ("Add", False, [(0, 0, 0)]), # 5 - ("Slice", True, [(5, 0, 1)]), # 6 - ("Slice", True, [(6, 0, 0)]), # 7 - ("Unsqueeze", True, [(6, 0, 2)]), # 8 - ("Gather", True, [(8, 0, 0)]), # 9 - ("Shape", True, [(9, 0, 0)]), # 10 - ("Softmax", False, [(5, 0, 0)]), # 11 - ("MatMul", False, [(11, 0, 0)]), # 12 - ("Transpose", True, [(12, 0, 1)]), # 13 - ("Transpose", False, [(12, 0, 0)]), # 14 - ("FusedMatMul", False, [(13, 0, 1)]), # 15 - ("SoftmaxGrad_13", False, [(15, 0, 0), (11, 0, 1)]), # 16 - ("Identity", False, [(16, 0, 0)]), # 17 - ("FusedMatMul", False, [(2, 0, 1), (17, 0, 0)]), # 18 - ("FusedMatMul", False, [(1, 0, 0), (17, 0, 1)]), # 19 - ("Mul", False, [(18, 0, 0)]), # 20 - ("Mul", False, [(19, 0, 0)]), # 21 - ("Identity", False, [(20, 0, 0)]), # 22 - ("Identity", False, [(21, 0, 0)]), # 23 - ("Transpose", False, [(22, 0, 0)]), # 24 - ("Transpose", False, [(23, 0, 0)]), # 25 - ("FusedMatMul", False, [(11, 0, 0)]), # 26 - ("Transpose", True, [(26, 0, 1)]), # 27 - ("Transpose", False, [(26, 0, 0)]), # 28 -] - - -def _optimize_for_pattern_3(matcher: GraphMatcher, idx: int, nodes: List[NodeProto]): - # Check forward only as the backward is expected to be consistent if it's built correctly. - scale_value_1 = matcher.get_constant_value(nodes[1].input[1]) - scale_value_1 = scale_value_1[0] if isinstance(scale_value_1, list) else scale_value_1 - scale_value_2 = matcher.get_constant_value(nodes[2].input[1]) - scale_value_2 = scale_value_2[0] if isinstance(scale_value_2, list) else scale_value_2 - if not ( - check_attribute_value(nodes[3], "perm", [0, 2, 1, 3]) - and check_attribute_value(nodes[4], "perm", [0, 2, 3, 1]) - and check_attribute_value(nodes[13], "perm", [0, 2, 1, 3]) - and check_attribute_value(nodes[14], "perm", [0, 2, 1, 3]) - and scale_value_1 == scale_value_2 - ): - return [], [], [] - - nodes_to_add, new_value_infos = _make_efficient_attention_nodes( - idx, - nodes[3].input[0], - nodes[4].input[0], - nodes[13].input[0], - nodes[14].output[0], - nodes[27].input[0], - nodes[24].output[0], - nodes[25].output[0], - nodes[28].output[0], - "", - False, - scale_value_1, - 0.0, - True, - ) - return nodes, nodes_to_add, new_value_infos - - _PATTERNS = [ (_PATTERN_0, _optimize_for_pattern_0), (_PATTERN_1, _optimize_for_pattern_1), - (_PATTERN_2, _optimize_for_pattern_2), - (_PATTERN_3, _optimize_for_pattern_3), ] From a788514027c3a6ee5f284c965ccffcb8805302a5 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 5 Mar 2024 18:27:26 -0800 Subject: [PATCH 114/279] [js/web] dump debug logs for karma for diagnose purpose (#19785) ### Description dump debug logs for karma for diagnose purpose. This is for debugging the CI issue of Chrome launch failure and considered temporary. 
--- js/web/script/test-runner-cli.ts | 3 +++ .../github/azure-pipelines/templates/win-web-ci.yml | 12 ++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts index 59bd0d5f6313a..ace64e9532b12 100644 --- a/js/web/script/test-runner-cli.ts +++ b/js/web/script/test-runner-cli.ts @@ -569,6 +569,9 @@ async function main() { if (webnn) { chromiumFlags.push('--enable-experimental-web-platform-features'); } + if (process.argv.includes('--karma-debug')) { + karmaArgs.push('--log-level debug'); + } karmaArgs.push(`--bundle-mode=${args.bundleMode}`); karmaArgs.push(...chromiumFlags.map(flag => `--chromium-flags=${flag}`)); if (browser.startsWith('Edge')) { diff --git a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml index b882d6fb167fd..9553bc1bc3547 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml @@ -153,31 +153,31 @@ jobs: errorActionPreference: stop displayName: 'Pack NPM packages' - script: | - npm test -- -e=chrome -b=webgl,wasm + npm test -- -e=chrome -b=webgl,wasm --karma-debug workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'Run ort-web tests (wasm,webgl backend)' condition: eq('${{ parameters.RunWebGpuTests }}', 'false') - script: | - npm test -- -e=chrome -b=webgl,wasm,webgpu $(webgpuCommandlineExtraFlags) + npm test -- -e=chrome -b=webgl,wasm,webgpu --karma-debug $(webgpuCommandlineExtraFlags) workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'Run ort-web tests (ALL backends)' condition: eq('${{ parameters.RunWebGpuTests }}', 'true') - script: | - npm test -- suite1 -e=chrome -b=webgpu --io-binding=gpu-tensor $(webgpuCommandlineExtraFlags) + npm test -- suite1 -e=chrome -b=webgpu --io-binding=gpu-tensor --karma-debug $(webgpuCommandlineExtraFlags) workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'Run ort-web tests (Suite1, webgpu, IO-binding=gpu-tensor)' condition: eq('${{ parameters.RunWebGpuTests }}', 'true') - script: | - npm test -- suite1 -e=chrome -b=webgpu --io-binding=gpu-location $(webgpuCommandlineExtraFlags) + npm test -- suite1 -e=chrome -b=webgpu --io-binding=gpu-location --karma-debug $(webgpuCommandlineExtraFlags) workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'Run ort-web tests (Suite1, webgpu, IO-binding=gpu-location)' condition: eq('${{ parameters.RunWebGpuTests }}', 'true') - script: | - npm test -- --webgl-texture-pack-mode -b=webgl -e=chrome + npm test -- --webgl-texture-pack-mode -b=webgl -e=chrome --karma-debug workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'Run ort-web tests - WebGL: packed mode' - script: | - npm test -- --wasm-enable-proxy -b=wasm -e=chrome + npm test -- --wasm-enable-proxy -b=wasm -e=chrome --karma-debug workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'Run ort-web tests - WebAssembly: proxy' condition: and(succeeded(), eq('${{ parameters.BuildConfig }}', 'Release')) From db59cec82f226dbba3ce7c5b03db35b0fe07fb60 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Wed, 6 Mar 2024 15:03:55 +1000 Subject: [PATCH 115/279] Don't reduce warning level for CUDA build on Windows (#19663) ### Description Address warnings so all the ORT projects build with /W4 on Windows. Mainly - unused parameters - variables shadowing other ones ### Motivation and Context #19588 started on this. 
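To make the two bullet points above concrete, the sketch below (illustrative only; none of these functions exist in ONNX Runtime, and all names are invented) shows the idioms this patch applies across the CUDA and TensorRT providers: commenting out or `[[maybe_unused]]`-marking unreferenced parameters, renaming locals so they no longer shadow an outer name, switching fully compile-time conditions to `if constexpr`, and wrapping non-/W4-clean third-party includes in `#pragma warning(push/pop)` instead of lowering the project-wide warning level.

```
// Minimal, self-contained sketch of the warning fixes described above.
// Nothing here is ORT code; every name is made up for illustration.
#include <cstddef>
#include <cstdio>
#include <string>
#include <type_traits>

// C4100 (unreferenced formal parameter): comment the name out when the
// parameter is never used in this overload/stub...
void LaunchKernelStub(float* /*output*/, std::size_t /*count*/) {
  // e.g. a specialization that only reports "unsupported"
}

// ...or keep the name and mark it when it is used only in some build configs.
void Log([[maybe_unused]] int verbosity, const std::string& message) {
#ifndef NDEBUG
  std::printf("[v%d] %s\n", verbosity, message.c_str());
#else
  std::printf("%s\n", message.c_str());
#endif
}

// Shadowing: rename the inner/local variable instead of reusing the outer
// name (the diff does this with S -> seq_len and channels_per_group -> _in).
int PadSequenceLength(int max_seq_len) {
  int seq_len = max_seq_len;  // distinct name, nothing shadowed
  if (seq_len <= 32) {
    seq_len = 32;
  } else if (seq_len <= 64) {
    seq_len = 64;
  }
  return seq_len;
}

// C4127 (conditional expression is constant): use `if constexpr` when the
// whole condition is known at compile time; a mixed compile-time/runtime
// condition cannot be constexpr, which is why the diff also suppresses
// C4127 per target where such conditions are unavoidable.
template <typename T>
std::size_t ElementSize(bool prefer_padded) {
  if constexpr (std::is_same_v<T, float>) {
    return sizeof(float);
  } else {
    if (std::is_same_v<T, double> && prefer_padded) {  // runtime term mixed in
      return 2 * sizeof(double);
    }
    return sizeof(T);
  }
}

// Third-party headers that are not /W4-clean get a local push/disable/pop
// around the include, rather than relaxing warnings project-wide.
#if defined(_MSC_VER)
#pragma warning(push)
#pragma warning(disable : 4100)  // unreferenced formal parameter inside the header
#endif
// #include "some_third_party_header.h"   // placeholder for cutlass/cub/TensorRT
#if defined(_MSC_VER)
#pragma warning(pop)
#endif

int main() {
  LaunchKernelStub(nullptr, 0);
  Log(2, "warning-clean build");
  std::printf("%d %zu\n", PadSequenceLength(48), ElementSize<int>(false));
  return 0;
}
```

The last idiom is the same pattern the diff applies to the cub, cutlass and TensorRT headers (including the new nv_includes.h wrapper).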
--- cmake/CMakeLists.txt | 6 +-- cmake/onnxruntime_providers_cuda.cmake | 13 ++++- .../core/providers/cuda/cuda_context.h | 2 +- .../cuda/bert/add_bias_transpose.cu | 10 ++-- .../contrib_ops/cuda/bert/attention_impl.cu | 20 +++---- .../cuda/bert/attention_prepare_qkv.cu | 4 +- .../bert/cutlass_fmha/fmha_launch_template.h | 8 +-- .../cuda/bert/decoder_attention_impl.cu | 2 +- .../cuda/bert/group_query_attention_impl.cu | 4 +- .../cuda/bert/packed_attention_impl.cu | 2 +- .../bert/packed_multihead_attention_impl.cu | 4 +- .../contrib_ops/cuda/bert/rotary_embedding.cc | 2 - .../cuda/bert/rotary_embedding_impl.cu | 2 +- .../mha_runner.cu | 54 +++++++++---------- .../cuda/diffusion/group_norm_common_base.h | 6 +-- onnxruntime/contrib_ops/cuda/inverse.cc | 8 +-- .../contrib_ops/cuda/math/complex_mul_impl.cu | 4 +- .../contrib_ops/cuda/math/gemm_float8.cu | 2 +- .../cuda/moe/ft_moe/moe_cutlass_kernel.h | 2 +- .../moe/ft_moe/moe_gemm_kernels_template.h | 29 ++++++---- .../contrib_ops/cuda/moe/ft_moe/moe_kernel.cu | 4 +- .../cuda/moe/ft_moe/moe_problem_visitor.h | 8 +-- .../quantization/attention_quantization.cc | 2 +- .../qordered_ops/qordered_attention.cc | 2 +- .../qordered_ops/qordered_attention_impl.cu | 2 +- .../qordered_ops/qordered_qdq_impl.cu | 2 +- .../cuda/transformers/generation_cuda_impl.cu | 17 ++++-- .../providers/cuda/cuda_execution_provider.h | 20 +++---- .../core/providers/cuda/cudnn_common.cc | 1 - .../cuda/math/unary_elementwise_ops_impl.cu | 7 +-- onnxruntime/core/providers/cuda/nn/conv.cc | 20 ++++--- onnxruntime/core/providers/cuda/nn/conv.h | 2 +- .../core/providers/cuda/nn/layer_norm.h | 2 - .../core/providers/cuda/nn/layer_norm_impl.cu | 2 - .../core/providers/cuda/rnn/cudnn_rnn_base.cc | 1 - .../cuda/tensor/gelu_approximate_impl.cu | 6 +-- .../cuda/tensor/resize_antialias_impl.cu | 20 +++---- .../core/providers/cuda/tensor/resize_impl.cu | 2 +- .../providers/cuda/tensor/transpose_impl.cu | 6 +-- .../core/providers/cuda/triton_kernel.cu | 50 ++++++++++------- .../core/providers/tensorrt/nv_includes.h | 20 +++++++ .../tensorrt/onnx_ctx_model_helper.h | 2 +- .../tensorrt/tensorrt_execution_provider.cc | 48 ++++++++++------- .../tensorrt/tensorrt_execution_provider.h | 5 +- .../tensorrt_execution_provider_custom_ops.cc | 5 +- .../tensorrt_execution_provider_custom_ops.h | 23 +++++--- ...oder_masked_multihead_attention_op_test.cc | 12 ++--- .../providers/cpu/generator/random_test.cc | 4 +- onnxruntime/test/unittest_main/test_main.cc | 17 +++++- .../training_ops/cuda/cross_entropy_test.cc | 10 ++-- .../training_ops/cuda/nn/conv_shared.cc | 11 ++-- .../cuda/nn/conv_transpose_grad.cc | 2 - .../training_ops/cuda/nn/layer_norm_impl.cu | 2 - .../training_ops/cuda/optimizer/lamb_impl.cu | 2 +- .../templates/jobs/win-ci-prebuild-steps.yml | 11 +++- 55 files changed, 315 insertions(+), 219 deletions(-) create mode 100644 onnxruntime/core/providers/tensorrt/nv_includes.h diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 0d55d4cab9826..3f919d7bf6e18 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1274,11 +1274,7 @@ endif() #Dependencies end. 
In the next we'll enable "treat warning as error" #Adjust warning flags -if (onnxruntime_USE_CUDA) - set_msvc_c_cpp_compiler_warning_level(3) -else() - set_msvc_c_cpp_compiler_warning_level(4) -endif() +set_msvc_c_cpp_compiler_warning_level(4) set(onnxruntime_DELAYLOAD_FLAGS "") diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 7f295a59a0931..aeeac10ead27d 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -141,18 +141,22 @@ if (HAS_GUARD_CF) target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler /guard:cf>") endif() + if (HAS_QSPECTRE) target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler /Qspectre>") endif() + foreach(ORT_FLAG ${ORT_WARNING_FLAGS}) target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler \"${ORT_FLAG}\">") endforeach() + # CUDA 11.3+ supports parallel compilation # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver-threads if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.3) option(onnxruntime_NVCC_THREADS "Number of threads that NVCC can use for compilation." 1) target_compile_options(${target} PRIVATE "$<$:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">") endif() + if (UNIX) target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler -Wno-reorder>" "$<$>:-Wno-reorder>") @@ -162,6 +166,13 @@ #mutex.cuh(91): warning C4834: discarding return value of function with 'nodiscard' attribute target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler /wd4834>") target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler /wd4127>") + if (MSVC) + # the VS warnings for 'Conditional Expression is Constant' are spurious as they don't handle multiple conditions + # e.g. `if (std::is_same_v && not_a_const)` will generate the warning even though constexpr cannot + # be used due to `&& not_a_const`. This affects too many places for it to be reasonable to disable at a finer + # granularity. 
+ target_compile_options(${target} PRIVATE "$<$:/wd4127>") + endif() endif() onnxruntime_add_include_to_target(${target} onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers) @@ -187,7 +198,7 @@ target_link_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/lib) endif() endif() - + if (onnxruntime_USE_TRITON_KERNEL) # compile triton kernel, generate .a and .h files include(onnxruntime_compile_triton_kernel.cmake) diff --git a/include/onnxruntime/core/providers/cuda/cuda_context.h b/include/onnxruntime/core/providers/cuda/cuda_context.h index 108173474db46..7104e70c3a8a9 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_context.h +++ b/include/onnxruntime/core/providers/cuda/cuda_context.h @@ -58,7 +58,7 @@ struct CudaContext : public CustomOpContext { template T FetchResource(const OrtKernelContext& kernel_ctx, CudaResource resource_type) { - if (sizeof(T) > sizeof(void*)) { + if constexpr (sizeof(T) > sizeof(void*)) { ORT_CXX_API_THROW("void* is not large enough to hold resource type: " + std::to_string(resource_type), OrtErrorCode::ORT_INVALID_ARGUMENT); } const auto& ort_api = Ort::GetApi(); diff --git a/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu b/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu index 1ea2540db486f..9e6752b451868 100644 --- a/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu +++ b/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu @@ -843,11 +843,11 @@ void InvokeAddBiasTransposeTrt( template <> void LaunchAddBiasTransposeTrt( - cudaStream_t stream, const int max_threads_per_block, - const int batch_size, const int sequence_length, - const int num_heads, const int head_size, - const float* biases, const float* query, const float* key, const float* value, float* output, - bool is_cross_attention, int kv_sequence_length) { + cudaStream_t /*stream*/, const int /*max_threads_per_block*/, + const int /*batch_size*/, const int /*sequence_length*/, + const int /*num_heads*/, const int /*head_size*/, + const float* /*biases*/, const float* /*query*/, const float* /*key*/, const float* /*value*/, float* /*output*/, + bool /*is_cross_attention*/, int /*kv_sequence_length*/) { ORT_ENFORCE(false, "Shall not call this since fused kernel does not support float input."); } diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu index c20f42c4d06bc..a93fdf74dc28c 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu @@ -58,12 +58,12 @@ size_t AlignSize(size_t bytes) { return bytesAligned; } -void CumulatedSequenceLengthCache::Initialize(int32_t sequence_length, cudaStream_t stream) { - if (this->sequence_length != sequence_length) { +void CumulatedSequenceLengthCache::Initialize(int32_t seq_length, cudaStream_t stream) { + if (this->sequence_length != seq_length) { ORT_ENFORCE(buffer.get() != nullptr && this->max_batch_size > 0); LaunchTrtSequenceOffset(reinterpret_cast(buffer.get()), nullptr, - this->max_batch_size, sequence_length, stream); - this->sequence_length = sequence_length; + this->max_batch_size, seq_length, stream); + this->sequence_length = seq_length; } } @@ -213,9 +213,9 @@ Status FusedTrtCrossAttention( template <> Status FusedTrtCrossAttention( - cudaStream_t stream, - contrib::AttentionParameters& parameters, - AttentionData& data) { + cudaStream_t /*stream*/, + contrib::AttentionParameters& /*parameters*/, + AttentionData& /*data*/) { return 
ORT_MAKE_STATUS(ONNXRUNTIME, StatusCode::NOT_IMPLEMENTED, "Trt fused cross attention does not support float tensor"); } @@ -276,9 +276,9 @@ Status FusedTrtSelfAttention( // Template Specialization for float type template <> Status FusedTrtSelfAttention( - cudaStream_t stream, - contrib::AttentionParameters& parameters, - AttentionData& data) { + cudaStream_t /*stream*/, + contrib::AttentionParameters& /*parameters*/, + AttentionData& /*data*/) { return ORT_MAKE_STATUS(ONNXRUNTIME, StatusCode::NOT_IMPLEMENTED, "Trt fused attention does not support float tensor"); } diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_prepare_qkv.cu b/onnxruntime/contrib_ops/cuda/bert/attention_prepare_qkv.cu index a513d9e8d2211..b843966d88e85 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_prepare_qkv.cu +++ b/onnxruntime/contrib_ops/cuda/bert/attention_prepare_qkv.cu @@ -231,7 +231,7 @@ Status PrepareQkv_MHA_PackedQKV(contrib::AttentionParameters& parameters, AttentionData& data, cudaStream_t stream, int max_threads_per_block, - T* q, T* k, T* v, AttentionQkvFormat& qkv_format) { + T* /*q*/, T* /*k*/, T* /*v*/, AttentionQkvFormat& qkv_format) { const int batch_size = parameters.batch_size; const int sequence_length = parameters.sequence_length; const int num_heads = parameters.num_heads; @@ -279,7 +279,7 @@ Status PrepareQkv_MHA_PackedKV(contrib::AttentionParameters& parameters, AttentionData& data, cudaStream_t stream, int max_threads_per_block, - T* q, T* k, T* v, AttentionQkvFormat& qkv_format) { + T* /*q*/, T* k, T* /*v*/, AttentionQkvFormat& qkv_format) { const int batch_size = parameters.batch_size; const int kv_sequence_length = parameters.kv_sequence_length; const int num_heads = parameters.num_heads; diff --git a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h index db78722cc0e4c..c12cb374d9adf 100644 --- a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h +++ b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h @@ -242,18 +242,18 @@ void DispatchIsAligned(const MemoryEfficientAttentionParams& params) { using AlignedAK = AttentionKernel; #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(push) -#pragma warning(disable : 6287) +#pragma warning(disable : 6287 4189) // kAligned is used via capture so 4189 warning seems incorrect #endif // Run a more efficient kernel with `isAligned=True` when memory is correctly aligned. 
bool is_aligned = params.qk_head_size % AlignedAK::kAlignmentQ == 0 && params.qk_head_size % AlignedAK::kAlignmentK == 0 && params.v_head_size % AlignedAK::kAlignmentV == 0; -#if defined(_MSC_VER) && !defined(__clang__) -#pragma warning(pop) -#endif DISPATCH_BOOL(is_aligned, kIsAligned, ([&]() { LaunchCutlassFmha(params); })); +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(pop) +#endif } template diff --git a/onnxruntime/contrib_ops/cuda/bert/decoder_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/decoder_attention_impl.cu index e24d9da94c964..c0b1996789183 100644 --- a/onnxruntime/contrib_ops/cuda/bert/decoder_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/decoder_attention_impl.cu @@ -17,7 +17,7 @@ Status DecoderQkvToContext( const cudaDeviceProp& device_prop, Stream* ort_stream, cublasHandle_t& cublas, - const size_t element_size, + const size_t /*element_size*/, const int batch_size, const int sequence_length, const int kv_sequence_length, diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu index d88e9a49fb5ee..cb5631542c113 100644 --- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu @@ -451,7 +451,7 @@ __global__ void PastToTotalSeqlen(int32_t* seqlens_k, // Convert Past to Total sequence length tensor Status LaunchGetSeqlenBuff(contrib::GroupQueryAttentionParameters& parameters, int32_t* seqlens_k, int32_t* seqlens_k_buff, bool is_total, cudaStream_t stream, - const int threads_per_block) { + const int /*threads_per_block*/) { if (parameters.is_prompt) { return Status::OK(); } @@ -655,7 +655,7 @@ Status EfficientAttention( template Status QkvToContext( const cudaDeviceProp& device_prop, - cublasHandle_t& cublas, + cublasHandle_t& /*cublas*/, Stream* ort_stream, contrib::GroupQueryAttentionParameters& parameters, GroupQueryAttentionData& data) { diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/packed_attention_impl.cu index ce7ac3796dbe1..a84a310b46ca0 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/packed_attention_impl.cu @@ -440,7 +440,7 @@ Status LaunchTransposeRemovePadding( template Status FusedScaledDotProductAttention( - const cudaDeviceProp& device_prop, + const cudaDeviceProp& /*device_prop*/, cudaStream_t stream, PackedAttentionParameters& parameters, PackedAttentionData& data) { diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu index 49029da12a308..982c7eaa2cb2c 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu @@ -381,7 +381,7 @@ void InvokeTranspose( const T* query, const T* key, const T* value, const T* bias, T* output, const int batch_size, const int sequence_length, const int num_heads, const int qk_head_size, const int v_head_size, - AttentionQkvFormat source_format, AttentionQkvFormat target_format, + [[maybe_unused]] AttentionQkvFormat source_format, AttentionQkvFormat target_format, const int32_t* token_offset, int32_t token_count, cudaStream_t stream) { if (key != nullptr && value != nullptr) { @@ -551,7 +551,7 @@ void LaunchTranspose( template Status FusedAttentionTrt( - const cudaDeviceProp& device_prop, + const cudaDeviceProp& 
/*device_prop*/, cudaStream_t stream, PackedAttentionParameters& parameters, PackedMultiHeadAttentionData& data) { diff --git a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.cc index 9de7ba3885c3c..ab7479f2938fe 100644 --- a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.cc +++ b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.cc @@ -82,8 +82,6 @@ Status RotaryEmbedding::ComputeInternal(OpKernelContext* context) const { interleaved, device_prop.maxThreadsPerBlock, parameters.transposed); - - return Status::OK(); } } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.cu b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.cu index c6637041f05bd..3a14161f29e9f 100644 --- a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.cu @@ -93,7 +93,7 @@ Status LaunchRotaryEmbeddingKernel( const int num_heads, const int head_size, const int rotary_embedding_dim, - const int max_sequence_length, + const int /*max_sequence_length*/, const int position_ids_format, const bool interleaved, const int max_threads_per_block, diff --git a/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.cu b/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.cu index 8fb6575d27cc0..4a4e3eeecf642 100644 --- a/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.cu +++ b/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.cu @@ -53,9 +53,9 @@ class FusedMHARunnerFP16v2::mhaImpl { ~mhaImpl() {} - void setup(const int S, const int B) { + void setup(const int seq_len, const int B) { // For bert and vit, use flash attention when sequence length is larger than the threshold. - use_flash_attention = is_flash_attention(S); + use_flash_attention = is_flash_attention(seq_len); params.force_unroll = use_flash_attention; @@ -68,26 +68,26 @@ class FusedMHARunnerFP16v2::mhaImpl { warps_n = 1; } else { if (sm == 70) { - if (S == 64 || S == 96) { + if (seq_len == 64 || seq_len == 96) { warps_m = 2; warps_n = 2; - } else if (S == 128) { + } else if (seq_len == 128) { warps_m = 1; warps_n = 4; - } else if (S == 256 || S == 384) { + } else if (seq_len == 256 || seq_len == 384) { warps_m = 1; warps_n = 8; } else { ORT_ENFORCE(false, "Unsupported sequence length"); } } else { - if (S == 32 || S == 64 || S == 96 || S == 128) { + if (seq_len == 32 || seq_len == 64 || seq_len == 96 || seq_len == 128) { warps_m = 2; warps_n = 2; - } else if (S == 192 || S == 256) { + } else if (seq_len == 192 || seq_len == 256) { warps_m = 1; warps_n = 4; - } else if (S == 384) { + } else if (seq_len == 384) { warps_m = 1; warps_n = 8; } else { @@ -99,7 +99,7 @@ class FusedMHARunnerFP16v2::mhaImpl { // The number of threads per CTA. threads_per_cta = warps_m * warps_n * warps_k * 32; // The number of xmmas in the M dimension. We use one uint32_t per XMMA in the M dimension. 
- xmmas_m = (S + 16 * warps_m - 1) / (16 * warps_m); + xmmas_m = (seq_len + 16 * warps_m - 1) / (16 * warps_m); const float scale_bmm1 = interface->mScale; const float scale_softmax = 1.f; // Seems to be only required for int8 @@ -111,7 +111,7 @@ class FusedMHARunnerFP16v2::mhaImpl { params.b = B; params.h = interface->mNumHeads; - params.s = S; + params.s = seq_len; params.d = interface->mHeadSize; params.qkv_stride_in_bytes = 3 * interface->mNumHeads * interface->mHeadSize * sizeof(half); @@ -121,7 +121,7 @@ class FusedMHARunnerFP16v2::mhaImpl { has_causal_mask = false; } - void setup_causal_masked_fmha(const int S, const int B) { + void setup_causal_masked_fmha(const int seq_len, const int B) { const float scale_bmm1 = interface->mScale; const float scale_softmax = 1.f; // Seems to be only required for int8 const float scale_bmm2 = 1.f; @@ -132,7 +132,7 @@ class FusedMHARunnerFP16v2::mhaImpl { params.b = B; params.h = interface->mNumHeads; - params.s = S; + params.s = seq_len; params.d = interface->mHeadSize; params.qkv_stride_in_bytes = 3 * interface->mNumHeads * interface->mHeadSize * sizeof(half); @@ -182,30 +182,30 @@ class FusedMHARunnerFP16v2::mhaImpl { return max_seq_len; } - int S = max_seq_len; + int seq_len = max_seq_len; if (max_seq_len <= 32) { - S = (sm == 70) ? 64 : 32; + seq_len = (sm == 70) ? 64 : 32; } else if (max_seq_len <= 64) { - S = 64; + seq_len = 64; } else if (max_seq_len <= 96) { - S = 96; + seq_len = 96; } else if (max_seq_len <= 128) { - S = 128; + seq_len = 128; } else if (max_seq_len <= 192) { - S = (sm == 70) ? 256 : 192; + seq_len = (sm == 70) ? 256 : 192; } else if (max_seq_len <= 256) { - S = 256; + seq_len = 256; } else if (max_seq_len <= 384) { - S = 384; + seq_len = 384; } - return S; + return seq_len; } protected: - bool is_flash_attention(const int S) const { + bool is_flash_attention(const int seq_len) const { ORT_ENFORCE(interface->mHasCausalMask == false); - return interface->mEnableFlashAttention && S >= kMinSequenceLengthFlashAttention; + return interface->mEnableFlashAttention && seq_len >= kMinSequenceLengthFlashAttention; } private: @@ -232,12 +232,12 @@ FusedMHARunnerFP16v2::FusedMHARunnerFP16v2(const int numHeads, pimpl(new mhaImpl(this)) { } -void FusedMHARunnerFP16v2::setup(const int S, const int B) { - MHARunner::setup(S, B); +void FusedMHARunnerFP16v2::setup(const int seq_len, const int B) { + MHARunner::setup(seq_len, B); if (mHasCausalMask) { - pimpl->setup_causal_masked_fmha(S, B); + pimpl->setup_causal_masked_fmha(seq_len, B); } else { - pimpl->setup(S, B); + pimpl->setup(seq_len, B); } } diff --git a/onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h b/onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h index ea87d0c29111e..a80584d3293a0 100644 --- a/onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h +++ b/onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h @@ -136,10 +136,10 @@ struct GroupNormNHWCParams { bool use_silu, bool broadcast_skip, int channels_per_block) { - int32_t channels_per_group = num_channels / num_groups; + int32_t channels_per_group_in = num_channels / num_groups; // channels_per_block is computed in PrePack. // If the gamma is not initializer, channels_per_block might be zero after PrePack. In that happens, compute it here. 
- if (channels_per_block < channels_per_group) { + if (channels_per_block < channels_per_group_in) { channels_per_block = GetChannelsPerBlock(num_channels, num_groups); } @@ -167,7 +167,7 @@ struct GroupNormNHWCParams { this->hw_per_block = DivUp(this->hw, blocks_per_hw); this->channels_per_block = channels_per_block; - this->channels_per_group = channels_per_group; + this->channels_per_group = channels_per_group_in; this->hwc = this->hw * this->c; this->inv_hw_channels_per_group = 1.F / (float)(this->hw * this->channels_per_group); this->groups_per_block = channels_per_block / this->channels_per_group; diff --git a/onnxruntime/contrib_ops/cuda/inverse.cc b/onnxruntime/contrib_ops/cuda/inverse.cc index 81e161e60642c..9075dda26f86b 100644 --- a/onnxruntime/contrib_ops/cuda/inverse.cc +++ b/onnxruntime/contrib_ops/cuda/inverse.cc @@ -78,9 +78,9 @@ struct Inverse::ComputeImpl { cudaStream_t stream = ort_stream ? static_cast(ort_stream->GetHandle()) : nullptr; // Make a copy of the input which will serve as a workspace as well. - if (std::is_same::value || std::is_same::value) { + if constexpr (std::is_same::value || std::is_same::value) { IAllocatorUniquePtr input_workspace = inst->GetScratchBuffer(input_count, ort_stream); - if (std::is_same::value) { + if constexpr (std::is_same::value) { // Convert from MLFloat16(half) to float Impl_Cast(stream, reinterpret_cast(input.Data()), input_workspace.get(), input_count); } else { @@ -96,7 +96,7 @@ struct Inverse::ComputeImpl { // Need to compute ptrs for output buffers // Output for MLFloat IAllocatorUniquePtr output_ptrs = inst->GetScratchBuffer(n_batches, ort_stream); - if (std::is_same::value) { + if constexpr (std::is_same::value) { IAllocatorUniquePtr ml_float_output = inst->GetScratchBuffer(input_count, ort_stream); ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(stream, ml_float_output.get(), num_batches, rows, output_ptrs)); // Do the inverse @@ -112,7 +112,7 @@ struct Inverse::ComputeImpl { ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches)); // We are done here } - } else if (std::is_same::value) { + } else if constexpr (std::is_same::value) { IAllocatorUniquePtr input_workspace = inst->GetScratchBuffer(static_cast(input_count), ort_stream); CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input_workspace.get(), input.Data(), sizeof(double) * input_count, cudaMemcpyDeviceToDevice, stream)); diff --git a/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu b/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu index ca94477114ee2..47a64502b3480 100644 --- a/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu +++ b/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu @@ -97,8 +97,8 @@ void ComplexMul_Impl( const TArray* rhs_padded_strides, const T* rhs_data, const TArray* fdm_output_strides, - const onnxruntime::cuda::fast_divmod& fdm_H, - const onnxruntime::cuda::fast_divmod& fdm_C, + const onnxruntime::cuda::fast_divmod& /*fdm_H*/, + const onnxruntime::cuda::fast_divmod& /*fdm_C*/, T* output_data, int64_t count, int64_t lhs_size, diff --git a/onnxruntime/contrib_ops/cuda/math/gemm_float8.cu b/onnxruntime/contrib_ops/cuda/math/gemm_float8.cu index 064b6dd392437..28ab27ee33d10 100644 --- a/onnxruntime/contrib_ops/cuda/math/gemm_float8.cu +++ b/onnxruntime/contrib_ops/cuda/math/gemm_float8.cu @@ -174,7 +174,7 @@ Status GemmFloat8::ComputeGemm( int32_t dtype_A, int32_t dtype_B, int32_t dtype_C, int32_t dtype_Y, const TensorShape& shape_A, const TensorShape& shape_B, - const TensorShape& shape_C, const TensorShape& shape_Y, + 
const TensorShape& shape_C, const TensorShape& /*shape_Y*/, bool trans_A, bool trans_B, const void* p_input_a, const void* p_input_b, const void* p_input_c, const void* p_scale_a, const void* p_scale_b, const void* p_scale_y, void* p_output_y, int M, int N, int K, int lda, diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_cutlass_kernel.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_cutlass_kernel.h index bfe30b71170d8..cfe306c2482a5 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_cutlass_kernel.h +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_cutlass_kernel.h @@ -202,7 +202,7 @@ struct MoeFCGemm { total_rows_before_expert(total_rows_before_expert), gemm_n(gemm_n), gemm_k(gemm_k), - host_problem_sizes(nullptr) { + host_problem_sizes(host_problem_sizes) { if (platform::is_same::value || platform::is_same::value) { assert(weight_scales); } diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h index 66950c9b65970..a3dcf0da16b98 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h @@ -20,6 +20,12 @@ #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif +// Ignore CUTLASS warning C4100: unreferenced formal parameter +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4100) +#endif + #include "cutlass/array.h" #include "cutlass/numeric_conversion.h" #include "cutlass/layout/matrix.h" @@ -36,6 +42,10 @@ #include "layout_traits_helper.h" #include "moe_cutlass_kernel.h" +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + #ifdef __GNUC__ #pragma GCC diagnostic pop #endif @@ -149,10 +159,10 @@ void generic_moe_gemm_kernelLauncher(const T* A, const WeightType* B, const T* w template struct dispatch_stages { - static void dispatch(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, - int64_t* total_rows_before_expert, int64_t gemm_n, int64_t gemm_k, int num_experts, - CutlassGemmConfig gemm_config, int multi_processor_count, cudaStream_t stream, - int* occupancy = nullptr) { + static void dispatch(const T* /*A*/, const WeightType* /*B*/, const T* /*weight_scales*/, const T* /*biases*/, + T* /*C*/, int64_t* /*total_rows_before_expert*/, int64_t /*gemm_n*/, int64_t /*gemm_k*/, + int /*num_experts*/, CutlassGemmConfig /*gemm_config*/, int /*multi_processor_count*/, + cudaStream_t /*stream*/, [[maybe_unused]] int* occupancy = nullptr) { std::string err_msg = "Cutlass fpA_intB gemm. 
Not instantiates for arch " + std::to_string(arch::kMinComputeCapability) + " with stages set to " + std::to_string(Stages); ORT_THROW("[FT Error][dispatch_stages::dispatch] " + err_msg); @@ -221,9 +231,10 @@ template < typename T, typename WeightType, typename arch, typename EpilogueTag, typename std::enable_if::value && std::is_same::value>::type* = nullptr> void dispatch_moe_gemm_to_cutlass(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, - int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k, - int num_experts, CutlassGemmConfig gemm_config, int sm_version, - int multi_processor_count, cudaStream_t stream, int* occupancy = nullptr) { + int64_t* total_rows_before_expert, int64_t /*total_rows*/, + int64_t gemm_n, int64_t gemm_k, int num_experts, CutlassGemmConfig gemm_config, + int /*sm_version*/, int multi_processor_count, cudaStream_t stream, + int* occupancy = nullptr) { switch (gemm_config.tile_config) { case CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64: dispatch_gemm_config, @@ -300,8 +311,8 @@ void dispatch_moe_gemm_to_cutlass(const T* A, const WeightType* B, const T* weig template ::value>::type* = nullptr> void dispatch_moe_gemm_to_cutlass(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, - int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k, - int num_experts, CutlassGemmConfig gemm_config, int sm_version, + int64_t* total_rows_before_expert, int64_t /*total_rows*/, int64_t gemm_n, int64_t gemm_k, + int num_experts, CutlassGemmConfig gemm_config, int /*sm_version*/, int multi_processor_count, cudaStream_t stream, int* occupancy = nullptr) { switch (gemm_config.tile_config) { case CutlassTileConfig::CtaShape128x128x8_WarpShape64x64x8: diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu index f4f2b49032d23..a5b47bcddefbc 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu @@ -370,7 +370,7 @@ struct TopkConstants { template void topk_gating_softmax_launcher_helper(const T* input, const bool* finished, T* output, int* indices, int* source_row, - int num_rows, int num_experts, int k, cudaStream_t stream) { + int num_rows, int /*num_experts*/, int k, cudaStream_t stream) { static constexpr unsigned long MAX_BYTES_PER_LDG = 16; static constexpr int BYTES_PER_LDG = std::min((int)MAX_BYTES_PER_LDG, (int)sizeof(T) * EXPERTS); @@ -599,7 +599,7 @@ void CutlassMoeFCRunner::run_moe_fc( static constexpr bool scales_required = std::is_same::value || std::is_same::value; - if (scales_required) { + if constexpr (scales_required) { if (fc1_scales == nullptr) { ORT_THROW("[FT Error][Run MoE FC] Scales expected but scale for first matmul is a null pointer"); } else if (fc2_scales == nullptr) { diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_problem_visitor.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_problem_visitor.h index 00f977c615df6..1de8f6b69642c 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_problem_visitor.h +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_problem_visitor.h @@ -276,13 +276,13 @@ struct MoeProblemVisitor::ComputeInternal(OpKernelContext* context) const { CudaT dequant_scale; CudaT input_scale = *(reinterpret_cast(input_scale_tensor->Data())); CudaT weight_scale = *(reinterpret_cast(weight_scale_tensor->Data())); - if (sizeof(T) == 2) { + if constexpr (sizeof(T) == 2) { 
dequant_scale = __float2half(__half2float(input_scale) * __half2float(weight_scale)); } else { dequant_scale = input_scale * weight_scale; diff --git a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention.cc b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention.cc index 3cecebedae2f0..12835978536e1 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention.cc +++ b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention.cc @@ -142,7 +142,7 @@ inline void debug_print([[maybe_unused]] const T* arr, std::cout << "========" << name << std::endl; for (size_t i = 0; i < sz; i++) { if (i % w == 0) std::cout << std::endl; - if (std::is_same().value) { + if constepxr (std::is_same::value) { std::cout << (int)buf[i] << ", "; } else { std::cout << buf[i] << ", "; diff --git a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention_impl.cu b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention_impl.cu index f4d5a7b404a62..fd4b51f40fb4f 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention_impl.cu @@ -151,7 +151,7 @@ QOrderBatchInt8MatrixTransposeKernel(const int8_t* src, const int8_t* dst, const } } -Status QOrderBatchTransposeInt8Matrix(cudaStream_t stream, const cudaDeviceProp& device_prop, +Status QOrderBatchTransposeInt8Matrix(cudaStream_t stream, const cudaDeviceProp& /*device_prop*/, const int batch_size, const int rows, const int cols, const int8_t* input, int8_t* output) { ORT_ENFORCE(rows % 4 == 0 && cols % 4 == 0, "Matrix rows and cols must be divisible by 4!"); diff --git a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_qdq_impl.cu b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_qdq_impl.cu index baff8e76ec73b..e6ac0bc8a5171 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_qdq_impl.cu +++ b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_qdq_impl.cu @@ -389,7 +389,7 @@ QOrderDequantizeKernel_Strict(const int8_t* __restrict__ src, const __half* __re } } -Status QOrderDequantize_Strict(cudaStream_t stream, const cudaDeviceProp& device_prop, +Status QOrderDequantize_Strict(cudaStream_t stream, const cudaDeviceProp& /*device_prop*/, const int8_t* src, __half* dst, float scale, size_t N) { ORT_RETURN_IF(N & 0x3LL, "N can not divide by 4!"); diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu index a39abefed9cd0..eb1943b59d976 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu @@ -1,11 +1,22 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+ +// cub.cuh includes device/dispatch_radix_sort.cuh which has assignment in conditional expressions +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4706) +#endif +#include +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +#include + #include "core/providers/cuda/cuda_common.h" #include "core/providers/cuda/cu_inc/common.cuh" -#include "cub/util_type.cuh" -#include -#include + #include "contrib_ops/cuda/bert/utils.cuh" #include "contrib_ops/cuda/transformers/generation_cuda_impl.h" diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h index 5f62f313b86a2..75fe1dff7c4a4 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h @@ -131,41 +131,33 @@ class CUDAExecutionProvider : public IExecutionProvider { template const T* GetConstOnes(size_t count, cudaStream_t stream) { - constexpr bool is_float = std::is_same::value; - constexpr bool is_double = std::is_same::value; - constexpr bool is_half = std::is_same::value; - constexpr bool is_BFloat16 = std::is_same::value; -#if !defined(DISABLE_FLOAT8_TYPES) - constexpr bool is_Float8E4M3FN = std::is_same::value; - constexpr bool is_Float8E5M2 = std::is_same::value; -#endif - if (is_float) { + if constexpr (std::is_same::value) { if (!constant_ones_float_) { constant_ones_float_ = cuda::CreateConstantOnes(); } return reinterpret_cast(constant_ones_float_->GetBuffer(stream, count)); - } else if (is_double) { + } else if constexpr (std::is_same::value) { if (!constant_ones_double_) { constant_ones_double_ = cuda::CreateConstantOnes(); } return reinterpret_cast(constant_ones_double_->GetBuffer(stream, count)); - } else if (is_half) { + } else if constexpr (std::is_same::value) { if (!constant_ones_half_) { constant_ones_half_ = cuda::CreateConstantOnes(); } return reinterpret_cast(constant_ones_half_->GetBuffer(stream, count)); - } else if (is_BFloat16) { + } else if constexpr (std::is_same::value) { if (!constant_ones_bfloat16_) { constant_ones_bfloat16_ = cuda::CreateConstantOnes(); } return reinterpret_cast(constant_ones_bfloat16_->GetBuffer(stream, count)); #if !defined(DISABLE_FLOAT8_TYPES) - } else if (is_Float8E4M3FN) { + } else if constexpr (std::is_same::value) { if (!constant_ones_float8e4m3fn_) { constant_ones_float8e4m3fn_ = cuda::CreateConstantOnes(); } return reinterpret_cast(constant_ones_float8e4m3fn_->GetBuffer(stream, count)); - } else if (is_Float8E5M2) { + } else if constexpr (std::is_same::value) { if (!constant_ones_float8e5m2_) { constant_ones_float8e5m2_ = cuda::CreateConstantOnes(); } diff --git a/onnxruntime/core/providers/cuda/cudnn_common.cc b/onnxruntime/core/providers/cuda/cudnn_common.cc index c850f7b583bfc..39b73163794f0 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.cc +++ b/onnxruntime/core/providers/cuda/cudnn_common.cc @@ -160,7 +160,6 @@ cudnnDataType_t CudnnTensor::GetDataType() { template <> cudnnDataType_t CudnnTensor::GetDataType() { ORT_THROW("cuDNN doesn't support BFloat16."); - return CUDNN_DATA_FLOAT; } template <> diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu index fd8f7929d4426..554d5908cf854 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu @@ -127,9 +127,10 @@ struct OP_Cast { UnaryElementWiseImpl(stream, 
input_data, output_data, OP_Cast(), count); \ } -#define IMPL_CAST_IMPL_THROW(InT, OutT) \ - void Explicit_Impl_Cast(cudaStream_t stream, const InT* input_data, OutT* output_data, size_t count) { \ - ORT_THROW("Cast from " #InT " to " #OutT " must define saturate."); \ +#define IMPL_CAST_IMPL_THROW(InT, OutT) \ + void Explicit_Impl_Cast(cudaStream_t /*stream*/, const InT* /*input_data*/, OutT* /*output_data*/, \ + size_t /*count*/) { \ + ORT_THROW("Cast from " #InT " to " #OutT " must define saturate."); \ } #if !defined(DISABLE_FLOAT8_TYPES) diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index a417be5a86c32..e05786248cbcf 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -97,11 +97,11 @@ Status SliceOutUnwantedOutputSection(cudaStream_t stream, template Status Conv::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, - bool& is_packed, [[maybe_unused]] PrePackedWeights* prepacked_weights) { + bool& is_packed, PrePackedWeights* /*prepacked_weights*/) { is_packed = false; // only layout of weight input is adjusted via PrePack - if (NHWC && is_nhwc_domain_) { // InputTensors::IN_W - if (input_idx == 1) { + if constexpr (NHWC) { + if (is_nhwc_domain_ && input_idx == 1) { // InputTensors::IN_W // Transpose from {M, C/group, kH, kW} to {M, kH, kW, C/group} auto orig_shape = tensor.Shape(); @@ -123,6 +123,10 @@ Status Conv::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr CUDA_CALL_THROW(cudaStreamSynchronize(DefaultCudaStream())); is_packed = true; } + } else { + ORT_UNUSED_PARAMETER(tensor); + ORT_UNUSED_PARAMETER(input_idx); + ORT_UNUSED_PARAMETER(alloc); } return Status::OK(); @@ -149,8 +153,11 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) // Make sure input and weight are 4D for NHWC since we set 4D descriptor for NHWC. 
constexpr bool channels_last = NHWC; - if (channels_last && (x_shape.NumDimensions() != 4 || w_shape.NumDimensions() != 4)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Number of dimensions of X and W should be 4 for channels_last format (NHWC)"); + if constexpr (channels_last) { + if (x_shape.NumDimensions() != 4 || w_shape.NumDimensions() != 4) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Number of dimensions of X and W should be 4 for channels_last format (NHWC)"); + } } // set B @@ -403,7 +410,8 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) default: perf.algo = kDefaultConvAlgo; CUDNN_RETURN_IF_ERROR(GetWorkspaceSize(GetCudnnHandle(context), s_, perf.algo, &perf.memory)); - if (std::is_same::value) { + + if constexpr (std::is_same::value) { perf.mathType = CUDNN_TENSOR_OP_MATH; } else if (std::is_same::value && !UseTF32()) { perf.mathType = CUDNN_FMA_MATH; diff --git a/onnxruntime/core/providers/cuda/nn/conv.h b/onnxruntime/core/providers/cuda/nn/conv.h index 181fbc99fd8e9..3aec654224e39 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.h +++ b/onnxruntime/core/providers/cuda/nn/conv.h @@ -195,7 +195,7 @@ class Conv : public CudaKernel { } Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, - bool& is_packed, [[maybe_unused]] PrePackedWeights* prepacked_weights) override; + bool& is_packed, PrePackedWeights* prepacked_weights) override; Status ComputeInternal(OpKernelContext* context) const override; diff --git a/onnxruntime/core/providers/cuda/nn/layer_norm.h b/onnxruntime/core/providers/cuda/nn/layer_norm.h index ff231f4f1ad5c..c021d3ffe63a2 100644 --- a/onnxruntime/core/providers/cuda/nn/layer_norm.h +++ b/onnxruntime/core/providers/cuda/nn/layer_norm.h @@ -7,8 +7,6 @@ namespace onnxruntime { namespace cuda { -using namespace onnxruntime::cuda; - // NOTE: This was originally a contrib op with 3 type constraints. The ONNX spec merges 'T' and 'V'. // the kernel is templatized on all three for backwards compatibility, but in ONNX usage T == V. 
template diff --git a/onnxruntime/core/providers/cuda/nn/layer_norm_impl.cu b/onnxruntime/core/providers/cuda/nn/layer_norm_impl.cu index 679b8b6b78886..b9e8b45307079 100644 --- a/onnxruntime/core/providers/cuda/nn/layer_norm_impl.cu +++ b/onnxruntime/core/providers/cuda/nn/layer_norm_impl.cu @@ -29,8 +29,6 @@ namespace onnxruntime { namespace cuda { -using namespace onnxruntime::cuda; - template __device__ void cuWelfordOnlineSum( const U curr, diff --git a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc index b61b104790fe5..6476364a211fd 100644 --- a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc +++ b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc @@ -305,7 +305,6 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { if (!weight_cached_) { const Tensor& W = *ctx->Input(RNN_Input_Index::W); const Tensor& R = *ctx->Input(RNN_Input_Index::R); - const Tensor* B = ctx->Input(RNN_Input_Index::B); ORT_RETURN_IF_ERROR(ReorganizeWeights(&W, &R, B, w_data_size_in_bytes, w_data, w_desc, rnn_desc, ctx->GetComputeStream())); } diff --git a/onnxruntime/core/providers/cuda/tensor/gelu_approximate_impl.cu b/onnxruntime/core/providers/cuda/tensor/gelu_approximate_impl.cu index 3292650584de8..7a27b7af33137 100644 --- a/onnxruntime/core/providers/cuda/tensor/gelu_approximate_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/gelu_approximate_impl.cu @@ -62,7 +62,7 @@ __global__ void FastGeluKernel2(const half2 a, const half2 b, const half2 c, int } template <> -Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int input_length, int bias_length, +Status LaunchFastGeluKernel(const cudaDeviceProp& /*prop*/, cudaStream_t stream, int input_length, int bias_length, const float* input, const float* bias, float* output, bool /*use_half2*/) { constexpr int blockSize = 256; const int gridSize = (input_length + blockSize - 1) / blockSize; @@ -73,7 +73,7 @@ Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int } template <> -Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int input_length, int bias_length, +Status LaunchFastGeluKernel(const cudaDeviceProp& /*prop*/, cudaStream_t stream, int input_length, int bias_length, const double* input, const double* bias, double* output, bool /*use_half2*/) { constexpr int blockSize = 256; const int gridSize = (input_length + blockSize - 1) / blockSize; @@ -108,7 +108,7 @@ Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int } template <> -Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int input_length, int bias_length, +Status LaunchFastGeluKernel(const cudaDeviceProp& /*prop*/, cudaStream_t stream, int input_length, int bias_length, const BFloat16* input, const BFloat16* bias, BFloat16* output, bool /*use_half2*/) { constexpr int blockSize = 256; diff --git a/onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu index 56b7c3f499303..d56e4bc53874d 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu @@ -680,10 +680,10 @@ template void ResizeTrilinearUpsample( cudaStream_t stream, int rank, - const UpsampleMode upsample_mode, + const UpsampleMode /*upsample_mode*/, ResizeCoordinateTransformationMode coordinate_transform_mode, - gsl::span input_shape, - gsl::span output_shape, + 
gsl::span /*input_shape*/, + gsl::span /*output_shape*/, int64_t batch_size, int64_t num_channels, std::tuple inferred_input_dims, std::tuple inferred_output_dims, @@ -832,11 +832,11 @@ void ResizeTrilinearUpsample( template void ResizeBiLinearUpsample(cudaStream_t stream, int rank, - const UpsampleMode upsample_mode, + const UpsampleMode /*upsample_mode*/, ResizeCoordinateTransformationMode coordinate_transform_mode, - gsl::span input_shape, - gsl::span output_shape, - int64_t batch_size, int64_t num_channels, + gsl::span /*input_shape*/, + gsl::span /*output_shape*/, + int64_t /*batch_size*/, int64_t num_channels, std::tuple inferred_input_dims, std::tuple inferred_output_dims, std::tuple inferred_dim_rscales, @@ -959,10 +959,10 @@ void ResizeBiLinearUpsample(cudaStream_t stream, template void ResizeBicubicUpsample(cudaStream_t stream, int rank, - const UpsampleMode upsample_mode, + const UpsampleMode /*upsample_mode*/, ResizeCoordinateTransformationMode coordinate_transform_mode, - gsl::span input_shape, - gsl::span output_shape, + gsl::span /*input_shape*/, + gsl::span /*output_shape*/, int64_t batch_size, int64_t num_channels, std::tuple inferred_input_dims, std::tuple inferred_output_dims, diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu index 0cde0ed8e8681..e788f24052985 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu @@ -609,7 +609,7 @@ void ResizeNearestImpl( const size_t N, bool extrapolation_enabled, const T extrapolation_value, - float cubic_coeff_a, + float /*cubic_coeff_a*/, ResizeCoordinateTransformationMode transform_coordinate, ResizeNearestMode calc_nearest_pixel, int64_t* /* prefix_dim_sum */, diff --git a/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu b/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu index 9f9c365d2a53d..6344845359b32 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu @@ -80,7 +80,7 @@ bool CanDoTranspose3D(const cudaDeviceProp& prop, size_t rank, const gsl::span& input_shape, - const TArray& input_strides, const void* input_data, void* output_data, int64_t N, + const TArray& input_strides, const void* input_data, void* output_data, int64_t /*N*/, const dim3& grid_size, const dim3& block_size) { switch (element_size) { HANDLE_TRANSPOSE_3D_TILE_DIM(int8_t); @@ -248,10 +248,10 @@ __global__ void Transpose4DKernelParallelizeOneElementPerThread( } bool CanDoTranspose4DParallelizeOneElementPerThread(const cudaDeviceProp& prop, - size_t element_size, + size_t /*element_size*/, int32_t rank, const gsl::span& input_dims, - const gsl::span& permutations, + const gsl::span& /*permutations*/, dim3& grid_size, dim3& block_size) { if (rank == 4) { // dims[3]: block.x diff --git a/onnxruntime/core/providers/cuda/triton_kernel.cu b/onnxruntime/core/providers/cuda/triton_kernel.cu index 6ffbf0420a15f..b42dbd0291b7a 100644 --- a/onnxruntime/core/providers/cuda/triton_kernel.cu +++ b/onnxruntime/core/providers/cuda/triton_kernel.cu @@ -130,27 +130,11 @@ void LoadOrtTritonKernel() { std::call_once(load_ort_triton_kernel_flag, TryToLoadKernel); } -Status LaunchTritonKernel(cudaStream_t stream, std::string fname, - int grid0, int grid1, int grid2, void* args, size_t args_size) { -#ifdef USE_TRITON_KERNEL - if (ort_triton_kernel_map.count(fname) == 0) { - // Return unsupported status if function name not found in registry. 
- // This error status will be used by TunableOp - std::ostringstream message_stream; - message_stream << "Can't find ort triton kernel name: " << fname; - std::string message = message_stream.str(); - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(true, message); - } - auto idx = ort_triton_kernel_map[fname]; - return LaunchTritonKernel(stream, idx, grid0, grid1, grid2, args, args_size); -#else - return Status::OK(); -#endif -} -Status LaunchTritonKernel(cudaStream_t stream, size_t idx, - int grid0, int grid1, int grid2, void* args, size_t args_size) { + #ifdef USE_TRITON_KERNEL +Status LaunchTritonKernel(cudaStream_t stream, size_t idx, int grid0, int grid1, int grid2, + void* args, size_t args_size) { if (idx >= ort_triton_kernel_metadata.size()) { // Return unsupported status when idx exceeds the size of ort_triton_kernel_metadata. // This error status will be used by TunableOp @@ -181,11 +165,37 @@ Status LaunchTritonKernel(cudaStream_t stream, size_t idx, nullptr, (void**)&config), "Launching kernel failed."); -#endif return Status::OK(); } +Status LaunchTritonKernel(cudaStream_t stream, std::string fname, int grid0, int grid1, int grid2, + void* args, size_t args_size) { + if (ort_triton_kernel_map.count(fname) == 0) { + // Return unsupported status if function name not found in registry. + // This error status will be used by TunableOp + std::ostringstream message_stream; + message_stream << "Can't find ort triton kernel name: " << fname; + std::string message = message_stream.str(); + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(true, message); + } + auto idx = ort_triton_kernel_map[fname]; + return LaunchTritonKernel(stream, idx, grid0, grid1, grid2, args, args_size); +} + +#else +Status LaunchTritonKernel(cudaStream_t /*stream*/, std::string /*fname*/, int /*grid0*/, int /*grid1*/, int /*grid2*/, + void* /*args*/, size_t /*args_size*/) { + return Status::OK(); +} + +Status LaunchTritonKernel(cudaStream_t /*stream*/, size_t /*idx*/, int /*grid0*/, int /*grid1*/, int /*grid2*/, + void* /*args*/, size_t /*args_size*/) { + return Status::OK(); +} +#endif + + const TritonKernelMetaData* GetOrtTritonKernelMetadata(size_t idx) { if (idx >= ort_triton_kernel_metadata.size()) { return nullptr; diff --git a/onnxruntime/core/providers/tensorrt/nv_includes.h b/onnxruntime/core/providers/tensorrt/nv_includes.h new file mode 100644 index 0000000000000..c3e9f7a3a2a77 --- /dev/null +++ b/onnxruntime/core/providers/tensorrt/nv_includes.h @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +#pragma once + +// File to include the required TRT headers with workarounds for warnings we can't fix. 
+ +// Ignore warning C4100: unreferenced formal parameter +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4100) +#endif + +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif diff --git a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h index bf3bf9e3495d7..9f1e5178428e7 100644 --- a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h @@ -6,7 +6,7 @@ #include #include -#include "NvInfer.h" +#include "core/providers/tensorrt/nv_includes.h" #include "core/providers/shared_library/provider_api.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 157cd0a200b35..e521640681a77 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -7,6 +7,7 @@ #define ORT_API_MANUAL_INIT #include "core/session/onnxruntime_cxx_api.h" #include "core/common/common.h" +#include "core/common/narrow.h" #include "core/common/safeint.h" #include "tensorrt_execution_provider.h" #include "tensorrt_execution_provider_utils.h" @@ -137,10 +138,10 @@ std::vector SplitToStringVec(std::string const& s, char separator) return splitted; } -nvinfer1::TacticSources GetTacticSourceFromString(std::string& tactic_sting) { +nvinfer1::TacticSources GetTacticSourceFromString(std::string& tactic_string) { nvinfer1::TacticSources disabledTactics = 0; nvinfer1::TacticSources enabledTactics = 0; - std::vector tacticList = SplitToStringVec(tactic_sting, ','); + std::vector tacticList = SplitToStringVec(tactic_string, ','); for (auto& t : tacticList) { bool enable{false}; if (t.front() == '+') { @@ -151,8 +152,8 @@ nvinfer1::TacticSources GetTacticSourceFromString(std::string& tactic_sting) { t.erase(0, 1); const auto toUpper = [](std::string& sourceName) { - std::transform( - sourceName.begin(), sourceName.end(), sourceName.begin(), [](char c) { return std::toupper(c); }); + std::transform(sourceName.begin(), sourceName.end(), sourceName.begin(), + [](char c) { return onnxruntime::narrow(std::toupper(c)); }); return sourceName; }; @@ -288,7 +289,8 @@ void CudaCall(cudnnStatus_t retCode, const char* exprString return g_host->CudaCall_true(retCode, exprString, libName, successCode, msg, file, line); } -void* OutputAllocator::reallocateOutput(char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment) noexcept { +void* OutputAllocator::reallocateOutput(char const* /*tensorName*/, void* /*currentMemory*/, uint64_t size, + uint64_t /*alignment*/) noexcept { // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr // even for empty tensors, so allocate a dummy byte. 
size = std::max(size, static_cast(1)); @@ -304,7 +306,7 @@ void* OutputAllocator::reallocateOutput(char const* tensorName, void* currentMem return outputPtr; } -void OutputAllocator::notifyShape(char const* tensorName, nvinfer1::Dims const& dims) noexcept { +void OutputAllocator::notifyShape(char const* /*tensorName*/, nvinfer1::Dims const& dims) noexcept { output_shapes.clear(); output_shapes.reserve(dims.nbDims); for (int i = 0; i < dims.nbDims; i++) { @@ -613,20 +615,22 @@ Status ApplyProfileShapesFromInputTensorValue(std::vector(shape_size); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input.get(), input_tensor.GetTensorData(), shape_size * sizeof(int32_t), cudaMemcpyDeviceToHost, stream)); + auto input_shape = std::make_unique(shape_size); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input_shape.get(), input_tensor.GetTensorData(), + shape_size * sizeof(int32_t), cudaMemcpyDeviceToHost, stream)); CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); for (int j = 0; j < shape_size; ++j) { - tensor_shape_values[input_name][j] = input[j]; + tensor_shape_values[input_name][j] = input_shape[j]; } break; } case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: { - auto input = std::make_unique(shape_size); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input.get(), input_tensor.GetTensorData(), shape_size * sizeof(int64_t), cudaMemcpyDeviceToHost, stream)); + auto input_shape = std::make_unique(shape_size); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input_shape.get(), input_tensor.GetTensorData(), + shape_size * sizeof(int64_t), cudaMemcpyDeviceToHost, stream)); CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); for (int j = 0; j < shape_size; ++j) { - tensor_shape_values[input_name][j] = static_cast(input[j]); + tensor_shape_values[input_name][j] = static_cast(input_shape[j]); } break; } @@ -974,7 +978,7 @@ Status BindContextOutput(Ort::KernelContext& ctx, * we are waiting for ORT core to support "assign" memory address to ORT context output. Some works need to be done in ORT memory planner to be aware of this memory support. 
*/ Status BindKernelOutput(Ort::KernelContext& ctx, - OrtMemoryInfo* mem_info, + OrtMemoryInfo* /*mem_info*/, DDSOutputAllocatorMap& allocator_map, char const* output_name, size_t output_index, @@ -1143,7 +1147,8 @@ TensorrtExecutionProvider::PerThreadContext& TensorrtExecutionProvider::GetPerTh // get or create a context if (context_state_.retired_context_pool.empty()) { - context = std::make_shared(info_.device_id, info_.has_user_compute_stream, stream_); + context = std::make_shared(narrow(info_.device_id), + info_.has_user_compute_stream, stream_); } else { context = context_state_.retired_context_pool.back(); context_state_.retired_context_pool.pop_back(); @@ -1163,7 +1168,11 @@ TensorrtExecutionProvider::PerThreadContext& TensorrtExecutionProvider::GetPerTh } TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProviderInfo& info) - : IExecutionProvider{onnxruntime::kTensorrtExecutionProvider, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, info.device_id)}, info_(info), device_id_(info.device_id) { + : IExecutionProvider{onnxruntime::kTensorrtExecutionProvider, + OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, + narrow(info.device_id))}, + info_(info), + device_id_(info.device_id) { InitProviderOrtApi(); CUDA_CALL_THROW(cudaSetDevice(device_id_)); @@ -1655,7 +1664,8 @@ void TensorrtExecutionProvider::IncrementRegularRunCountBeforeGraphCapture() { std::vector TensorrtExecutionProvider::CreatePreferredAllocators() { AllocatorCreationInfo default_memory_info( - [](OrtDevice::DeviceId device_id) { return CreateCUDAAllocator(device_id, onnxruntime::CUDA); }, device_id_); + [](OrtDevice::DeviceId device_id) { return CreateCUDAAllocator(device_id, onnxruntime::CUDA); }, + narrow(device_id_)); AllocatorCreationInfo pinned_allocator_info( [](OrtDevice::DeviceId device_id) { @@ -3036,7 +3046,8 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView std::unordered_set input_names; std::unordered_map> tensor_shape_values; - OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, device_id_), device_id_); + OrtDevice device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, narrow(device_id_)); + OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, device, device_id_); if (alloc_ == nullptr) { Ort::ThrowOnError(api->KernelContext_GetAllocator(context, &mem_info, &alloc_)); } @@ -3603,7 +3614,8 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con // int num_inputs = static_cast(input_indexes.size()); int num_outputs = static_cast(output_indexes.size()); - OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, device_id_), device_id_); + OrtDevice device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, narrow(device_id_)); + OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, device, device_id_); if (alloc_ == nullptr) { Ort::ThrowOnError(api->KernelContext_GetAllocator(context, &mem_info, &alloc_)); } diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index 26f6b2dcc3020..339c45a8742d2 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -5,8 +5,9 @@ #include #include #include -#include "NvInfer.h" -#include "NvOnnxParser.h" + +#include 
"core/providers/tensorrt/nv_includes.h" + #include "core/platform/ort_mutex.h" #include "core/providers/cuda/cuda_graph.h" #include "tensorrt_execution_provider_info.h" diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc index eb340ba1e64b6..b4f348159440f 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc @@ -1,12 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include + #include "core/framework/provider_options.h" #include "tensorrt_execution_provider_custom_ops.h" #include "tensorrt_execution_provider.h" -#include -#include -#include namespace onnxruntime { extern TensorrtLogger& GetTensorrtLogger(); diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h index b19d9ab0f66d0..54212d34aa2ce 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h @@ -13,7 +13,8 @@ using namespace onnxruntime; namespace onnxruntime { common::Status LoadDynamicLibrary(onnxruntime::PathString library_name); -common::Status CreateTensorRTCustomOpDomainList(std::vector& domain_list, const std::string extra_plugin_lib_paths); +common::Status CreateTensorRTCustomOpDomainList(std::vector& domain_list, + const std::string extra_plugin_lib_paths); common::Status CreateTensorRTCustomOpDomainList(TensorrtExecutionProviderInfo& info); void ReleaseTensorRTCustomOpDomain(OrtCustomOpDomain* domain); void ReleaseTensorRTCustomOpDomainList(std::vector& custom_op_domain_list); @@ -23,16 +24,22 @@ struct TensorRTCustomKernel { : compute_stream_(compute_stream) { } - void Compute(OrtKernelContext* context){}; // The implementation is in TensorRT plugin. No need to implement it here. + void Compute(OrtKernelContext* /*context*/){ + // The implementation is in TensorRT plugin. No need to implement it here. 
+ }; private: void* compute_stream_; }; struct TensorRTCustomOp : Ort::CustomOpBase { - explicit TensorRTCustomOp(const char* provider, void* compute_stream) : provider_(provider), compute_stream_(compute_stream) {} + explicit TensorRTCustomOp(const char* provider, void* compute_stream) : provider_(provider), + compute_stream_(compute_stream) { + } - void* CreateKernel(const OrtApi& /* api */, const OrtKernelInfo* info) const { return new TensorRTCustomKernel(info, compute_stream_); }; + void* CreateKernel(const OrtApi& /* api */, const OrtKernelInfo* info) const { + return new TensorRTCustomKernel(info, compute_stream_); + }; const char* GetName() const { return name_; }; @@ -46,7 +53,9 @@ struct TensorRTCustomOp : Ort::CustomOpBase QK_Transpose(MLFloat16* q_matrix, MLFloat16* k_transpose_ // Softmax_QK_Transpose template -std::vector Softmax_QK_Transpose(T* qk_transpose_matrix, - int batch_size, int num_heads, int sequence_length, int total_sequence_length, int head_size); +std::vector Softmax_QK_Transpose(T* qk_transpose_matrix, int batch_size, int num_heads, + int sequence_length, int total_sequence_length, int head_size); template <> -std::vector Softmax_QK_Transpose(float* qk_transpose_matrix, - int batch_size, int num_heads, int sequence_length, int total_sequence_length, int head_size) { +std::vector Softmax_QK_Transpose(float* qk_transpose_matrix, int batch_size, int num_heads, + int sequence_length, int total_sequence_length, int /*head_size*/) { if (sequence_length != 1) { throw std::runtime_error("Not supported"); } @@ -506,8 +506,8 @@ std::vector Softmax_QK_Transpose(float* qk_transpose_matrix, } template <> -std::vector Softmax_QK_Transpose(MLFloat16* qk_transpose_matrix, - int batch_size, int num_heads, int sequence_length, int total_sequence_length, int head_size) { +std::vector Softmax_QK_Transpose(MLFloat16* qk_transpose_matrix, int batch_size, int num_heads, + int sequence_length, int total_sequence_length, int /*head_size*/) { if (sequence_length != 1) { throw std::runtime_error("Not supported"); } diff --git a/onnxruntime/test/providers/cpu/generator/random_test.cc b/onnxruntime/test/providers/cpu/generator/random_test.cc index 16582696a81d4..532b98317405f 100644 --- a/onnxruntime/test/providers/cpu/generator/random_test.cc +++ b/onnxruntime/test/providers/cpu/generator/random_test.cc @@ -380,7 +380,7 @@ void RunRandomNormalGpuTest(const std::vector dims, const float mean, c test.AddOutput("Y", dims, fp16_data); } - auto output_verifier = [&](const std::vector& fetches, const std::string& provider_type) { + auto output_verifier = [&](const std::vector& fetches, const std::string& /*provider_type*/) { // Only one output, and mean of output values are near attribute mean. ASSERT_EQ(fetches.size(), 1u); const auto& output_tensor = fetches[0].Get(); @@ -472,7 +472,7 @@ void RunRandomUniformGpuTest(const std::vector dims, const float low, c test.AddOutput("Y", dims, fp16_data); } - auto output_verifier = [&](const std::vector& fetches, const std::string& provider_type) { + auto output_verifier = [&](const std::vector& fetches, const std::string& /*provider_type*/) { // Only one output. Each value in output tensoer is between low and high. // Mean of output values are near attribute mean of low and high. 
ASSERT_EQ(fetches.size(), 1u); diff --git a/onnxruntime/test/unittest_main/test_main.cc b/onnxruntime/test/unittest_main/test_main.cc index 4c38c90c2b418..d7e8bf9063645 100644 --- a/onnxruntime/test/unittest_main/test_main.cc +++ b/onnxruntime/test/unittest_main/test_main.cc @@ -32,17 +32,30 @@ void ortenv_setup() { } #ifdef USE_TENSORRT + +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4100) // Ignore warning C4100: unreferenced format parameter. +#endif + // TensorRT will load/unload libraries as builder objects are created and torn down. This will happen for // every single unit test, which leads to excessive test execution time due to that overhead. // Nvidia suggests to keep a placeholder builder object around to avoid this. #include "NvInfer.h" + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + class DummyLogger : public nvinfer1::ILogger { public: - DummyLogger(Severity verbosity) {} - void log(Severity severity, const char* msg) noexcept override {} + DummyLogger(Severity /*verbosity*/) {} + void log(Severity /*severity*/, const char* /*msg*/) noexcept override {} }; DummyLogger trt_logger(nvinfer1::ILogger::Severity::kWARNING); + auto const placeholder = std::unique_ptr(nvinfer1::createInferBuilder(trt_logger)); + #endif #define TEST_MAIN main diff --git a/orttraining/orttraining/test/training_ops/cuda/cross_entropy_test.cc b/orttraining/orttraining/test/training_ops/cuda/cross_entropy_test.cc index d9800ce0e0d3e..d36f9b307ec70 100644 --- a/orttraining/orttraining/test/training_ops/cuda/cross_entropy_test.cc +++ b/orttraining/orttraining/test/training_ops/cuda/cross_entropy_test.cc @@ -311,11 +311,9 @@ template static std::vector RunSCELossWithEP(const char* op, int opset_version, const char* domain, - std::function()> - ep_creator, + std::function()> ep_creator, const std::string& reduction, const std::int64_t ignore_index, - const double error_tolerance, const std::vector* X_dims, const std::vector* index_dims, const std::vector* weight_dims, @@ -403,7 +401,7 @@ static void TestSCELoss(const char* op, int opset_version, cpu_fetches = RunSCELossWithEP( op, opset_version, domain, []() -> std::unique_ptr { return DefaultCpuExecutionProvider(); }, - reduction, ignore_index, error_tolerance, + reduction, ignore_index, X_dims, index_dims, weight_dims, Y_dims, log_prob_dims, X_data_temp, index_data, weight_data_temp); @@ -411,7 +409,7 @@ static void TestSCELoss(const char* op, int opset_version, cpu_fetches = RunSCELossWithEP( op, opset_version, domain, []() -> std::unique_ptr { return DefaultCpuExecutionProvider(); }, - reduction, ignore_index, error_tolerance, + reduction, ignore_index, X_dims, index_dims, weight_dims, Y_dims, log_prob_dims, X_data, index_data, weight_data); @@ -429,7 +427,7 @@ static void TestSCELoss(const char* op, int opset_version, return DefaultRocmExecutionProvider(); #endif }, - reduction, ignore_index, error_tolerance, + reduction, ignore_index, X_dims, index_dims, weight_dims, Y_dims, log_prob_dims, X_data, index_data, weight_data); diff --git a/orttraining/orttraining/training_ops/cuda/nn/conv_shared.cc b/orttraining/orttraining/training_ops/cuda/nn/conv_shared.cc index d23905496c9bb..9b30bd128b161 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/conv_shared.cc +++ b/orttraining/orttraining/training_ops/cuda/nn/conv_shared.cc @@ -105,7 +105,8 @@ struct AlgoSearch { CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT, CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD, 
CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED}; static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; - ORT_ENFORCE(sizeof(algos) / sizeof(algos[0]) == num_algos, "Missing cuDNN convolution backward data algorithms."); + static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing cuDNN convolution backward data algorithms."); int perf_count; std::unique_ptr candidates = std::make_unique(num_algos); if (args.params.algo_mode == OrtCudnnConvAlgoSearchHeuristic) { @@ -146,7 +147,9 @@ struct AlgoSearch { // NOTE: - 1 because ALGO_WINOGRAD is not implemented. static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT - 1; - ORT_ENFORCE(sizeof(algos) / sizeof(algos[0]) == num_algos, "Missing cuDNN convolution backward filter algorithms."); + static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing cuDNN convolution backward filter algorithms."); + std::unique_ptr candidates = std::make_unique(num_algos); int perf_count; if (args.params.algo_mode == OrtCudnnConvAlgoSearchHeuristic) { @@ -188,7 +191,9 @@ struct AlgoSearch { }; static constexpr int num_algos = CUDNN_CONVOLUTION_FWD_ALGO_COUNT; - ORT_ENFORCE(sizeof(algos) / sizeof(algos[0]) == num_algos, "Missing cuDNN convolution backward filter algorithms."); + static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing cuDNN convolution backward filter algorithms."); + std::unique_ptr candidates = std::make_unique(num_algos); int perf_count; if (args.params.algo_mode == OrtCudnnConvAlgoSearchHeuristic) { diff --git a/orttraining/orttraining/training_ops/cuda/nn/conv_transpose_grad.cc b/orttraining/orttraining/training_ops/cuda/nn/conv_transpose_grad.cc index d3f5a89434a48..5d12e0ac312c0 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/conv_transpose_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/nn/conv_transpose_grad.cc @@ -53,7 +53,6 @@ Status ConvTransposeGrad::ComputeInputGradient(onnxruntime::Stream* stream, c algo_perf.algo, workspace.get(), algo_perf.memory, &zero, args.y_tensor, args.y_data)); return Status::OK(); }); - return Status::OK(); } template @@ -71,7 +70,6 @@ Status ConvTransposeGrad::ComputeWeightGradient(onnxruntime::Stream* stream, algo_perf.algo, workspace.get(), algo_perf.memory, &zero, args.w_desc, args.dw_data)); return Status::OK(); }); - return Status::OK(); } template diff --git a/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.cu b/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.cu index 2d89ed05712e0..ad577afa06c18 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.cu @@ -30,8 +30,6 @@ namespace onnxruntime { namespace cuda { -using namespace onnxruntime::cuda; - namespace { // This is the un-specialized struct. Note that we prevent instantiation of this // struct by putting an undefined symbol in the function body so it won't compile. 
diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb_impl.cu b/orttraining/orttraining/training_ops/cuda/optimizer/lamb_impl.cu index c90809eb2fdcc..fd55f7c30ff75 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb_impl.cu @@ -619,7 +619,7 @@ CudaKernel::CudaAsyncBuffer compute_tensor_rang template void LambMultiTensorReductionFunctor::operator()( - cudaStream_t stream, + cudaStream_t /*stream*/, ChunkGroup<4> chunk_group, const CudaKernel& kernel, void* reduction_buffer, diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-prebuild-steps.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-prebuild-steps.yml index 9516753d50113..864513bc4d671 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-prebuild-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-prebuild-steps.yml @@ -93,8 +93,17 @@ steps: $ccache_parent_dir = (Split-Path -parent $ccache_path) Copy-Item "C:\ProgramData\chocolatey\lib\ccache\tools\ccache-4.7.4-windows-x86_64\ccache.exe" -Destination "C:\ProgramData\chocolatey\bin\cl.exe" Get-ChildItem $ccache_parent_dir - ccache --version } + + "ccache info:" + ccache --version + ccache --show-config + + "cl.exe from path: $((Get-Command cl).Path). Version:" + (cl.exe -?) -match 'Compiler Version' + "C:\ProgramData\chocolatey\bin\cl.exe version:" + (C:\ProgramData\chocolatey\bin\cl.exe -?) -match 'Compiler Version' + displayName: Install ccache and update PATH to use linked versions of gcc, cc, etc - ${{ if eq(parameters.WITHCACHE, true) }}: From e93a860819545ea64acfe36e19e2b954389d48bf Mon Sep 17 00:00:00 2001 From: Ashwini Khade Date: Tue, 5 Mar 2024 21:54:48 -0800 Subject: [PATCH 116/279] Remove arm build for training (#19788) We no longer support Win arm 32 so removing the associated build and packaging job. 
--- .../ondevice-training-cpu-packaging-pipeline.yml | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml
index cf39be23cbdaf..b3faaf2a7f1a6 100644
--- a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml
@@ -61,21 +61,6 @@ stages:
     buildJava: false
     buildNodejs: false

-- template: win-ci.yml
-  parameters:
-    DoCompliance: ${{ parameters.DoCompliance }}
-    DoEsrp: ${{ parameters.DoEsrp }}
-    stage_name_suffix: Training_CPU_arm_${{ parameters.BuildVariant }}
-    artifact_name_suffix: -training
-    buildArch: x64
-    msbuildPlatform: arm
-    packageName: arm
-    buildparameter: --arm ${{ parameters.AdditionalBuildFlags }} ${{ parameters.AdditionalWinBuildFlags}} --path_to_protoc_exe $(Build.BinariesDirectory)\RelWithDebInfo\installed\bin\protoc.exe
-    runTests: false
-    buildJava: false
-    buildNodejs: false
-    ort_build_pool_name: onnxruntime-Win-CPU-2022
-
 - template: win-ci.yml
   parameters:
     DoCompliance: ${{ parameters.DoCompliance }}
@@ -127,7 +112,6 @@ stages:
   - Linux_C_API_Packaging_Training_CPU
   - Windows_Packaging_Training_CPU_x86_${{ parameters.BuildVariant }}
   - Windows_Packaging_Training_CPU_x64_${{ parameters.BuildVariant }}
-  - Windows_Packaging_Training_CPU_arm_${{ parameters.BuildVariant }}
   - Windows_Packaging_Training_CPU_arm64_${{ parameters.BuildVariant }}
   - Android_Java_API_AAR_Packaging_Training_Full
   condition: succeeded()

From d9bf85613d7171b54a6ece45fc0f241b008a1fd8 Mon Sep 17 00:00:00 2001
From: pengwa
Date: Wed, 6 Mar 2024 21:54:16 +0800
Subject: [PATCH 117/279] Adapt memory optimizer to fit PHI2 (#19757)

### Adapt memory optimizer to fit PHI2

A few improvements and bug fixes:
1. Fix a bug related to transformer layer detection.
2. Use the default topological order, reversed, when creating recompute nodes, so that leaf nodes are not handled too late and left with the lowest execution priority.
3. Add an early stop when an activation's element count is constant and the total element count is < 1M. This avoids the overhead of searching those subgraphs.

Using `export ORTMODULE_MEMORY_OPT_LEVEL=1` to enable layerwise recompute on the given recipe, memory consumption dropped from ~22GB to ~13GB.
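For illustration, the size check in item 3 reduces to a sketch like the following (standalone and simplified; the function name and the `-1`-for-dynamic convention are assumptions for this example, not the actual ORT helpers):

```cpp
#include <cstdint>
#include <vector>

// Simplified sketch of the early-stop rule: stop tracing a candidate activation for
// recompute only when every dimension is statically known and the total element count
// is below 1M. A dynamic (symbolic) dimension is encoded here as -1.
bool SkipSmallConstantActivation(const std::vector<int64_t>& dims) {
  constexpr int64_t kThreshold = 1 * 1024 * 1024;  // ~1M elements
  int64_t num_elements = 1;
  for (int64_t d : dims) {
    if (d < 0) return false;  // dynamic shape: keep searching this subgraph
    num_elements *= d;
  }
  return num_elements < kThreshold;  // small constant-shaped tensor: skip it
}
```

Anything without a fully static shape keeps being traced, since its real size is unknown at graph-optimization time.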
--- .../memory_optimizer/memory_insight.cc | 3 +- .../memory_optimizer/memory_optimizer.cc | 37 +++++++++++++++- .../memory_optimizer/recompute_analysis.cc | 18 +++++++- .../memory_optimizer/transformer_specific.cc | 42 +++++++++++++++++-- .../memory_optimizer/transformer_specific.h | 3 ++ 5 files changed, 95 insertions(+), 8 deletions(-) diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc index 08c402bf669c8..54c49db0597c7 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc @@ -258,7 +258,8 @@ Status FindORTModuleMemoryOpportunity(const GraphViewer& graph_viewer, logger)); InlinedHashSet layer_boundary_ln_nodes; - FindLayerBoundaryLayerNormNodes(graph_viewer, logger, layer_boundary_ln_nodes); + FindLayerBoundaryLayerNormNodes(graph_viewer, logger, node_index_to_its_order_in_topological_sort_map, + yield_op_order_in_topological_sort, layer_boundary_ln_nodes); // The first pass - find the candidate subgraphs. for (int i = static_cast(node_ids.size()) - 1; i >= 0; --i) { diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/memory_optimizer.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/memory_optimizer.cc index 525e3b4b8de35..40fa2fc5cc737 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/memory_optimizer.cc +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/memory_optimizer.cc @@ -190,11 +190,44 @@ Status MemoryOptimizer::ApplyImpl(Graph& graph, bool& modified, int /*graph_leve .IsOK()); // The second pass - apply the transformation. - // Iterate through the nodes in reversed topological order and find the subgraph that can be alleviated. + // Note 1: Iterate through the nodes in reversed topological order and find the subgraph that can be alleviated. // The reason we do reversed topological order is that we want the later layers' recompute nodes can be appended // earlier than the earlier layers, in this way, the execution order of later layers will be in front of the earlier // layers. - const auto& node_ids = graph_viewer.GetNodesInTopologicalOrder(ExecutionOrder::PRIORITY_BASED); + // + // Note 2: Here we use default typo order (which tries to BFS from the outputs, + // so the nearest node to graph output will be visited last). So in reversed default typo order, + // the neareast node to graph output will be visited first. + // Imagine there is a such subgraph + // input1 input2 input3 + // \ | / + // multiple layers + // | + // node M + // labels-------|----- + // \ | | + // node1 | | + // \ | | + // node2 / | + // \ / | + // node loss / + // | / + // YieldOp node1_recompute + // | / + // \ node2 recompute + // \ / + // node loss_grad + // | + // critical grad path + // + // In PriorityBased order, node1 will be visited first, so it's recompute node node1_recompute will be added + // at last because we do this following reversed topological order. Then node1_recompute node will have lowest + // priority to execute, as a result, if at this time, the queue to visit contains only recompute nodes, then + // node1_recompute will be run at last, affecting the backward critical path, which is not what we want. + // Current workaround is to use default order, which will execute node1_recompute earlier than other recompute nodes + // in this case. 
+ + const auto& node_ids = graph_viewer.GetNodesInTopologicalOrder(ExecutionOrder::DEFAULT); for (int i = static_cast(node_ids.size()) - 1; i >= 0; --i) { Node* p_node = graph.GetNode(node_ids[i]); if (p_node == nullptr) { diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc index 12c83591c0036..76b3325f36116 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc @@ -19,7 +19,7 @@ namespace onnxruntime::optimizer::memory_optimizer { namespace { -constexpr int32_t MAXIMUM_RECOMPUTE_NODE_COUNT = 15; +constexpr int32_t MAXIMUM_RECOMPUTE_NODE_COUNT = 50; static size_t GetElementSize(const ONNX_NAMESPACE::DataType& tensor_type) { const ONNX_NAMESPACE::TypeProto& type_proto = ONNX_NAMESPACE::Utils::DataTypeUtils::ToTypeProto(tensor_type); @@ -291,6 +291,22 @@ Status SelectRecomputeSubgraph(const Node& entry_node, const auto current_node_input_index = input_edge.GetDstArgIndex(); if (std::find(input_arg_indices.begin(), input_arg_indices.end(), current_node_input_index) != input_arg_indices.end()) { + // If the tensor size is constant and very small (Now < 1M), we stop adding the input edge into queue. + auto output_shape = parent_node.OutputDefs()[parent_node_output_index]->Shape(); + if (output_shape) { + bool all_constant_dim = true; + int64_t num_elem = 1; + for (int k = 0, dim_size = output_shape->dim_size(); k < dim_size; ++k) { + if (!output_shape->dim(k).has_dim_value()) { + all_constant_dim = false; + num_elem *= output_shape->dim(k).dim_value(); + } + } + if (all_constant_dim && num_elem < 1 * 1024 * 1024) { + // Skip this input index. + continue; + } + } NodeOutputPort next_p = std::make_pair(&parent_node, parent_node_output_index); MO_LOG_DEBUG_INFO(logger, "Node " + parent_node.Name() + "(" + parent_node.OpType() + ")'s " + diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/transformer_specific.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/transformer_specific.cc index 04f2679ac774f..c88a0f05d36b8 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/transformer_specific.cc +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/transformer_specific.cc @@ -19,6 +19,9 @@ namespace onnxruntime::optimizer::memory_optimizer { void FindLayerBoundaryLayerNormNodes( const GraphViewer& graph_viewer, const logging::Logger&, + const InlinedHashMap& + node_index_to_its_order_in_topological_sort_map, + const ptrdiff_t& yield_op_order_in_topological_sort, InlinedHashSet& layer_boundary_ln_nodes) { // Loop all nodes to find LayerNormalization nodes. // For each LayerNormalization node, keep checking its output nodes, @@ -40,9 +43,16 @@ void FindLayerBoundaryLayerNormNodes( std::deque nodes_to_check; std::set visited_nodes; for (auto node_it = node.OutputNodesBegin(); node_it != node.OutputNodesEnd(); ++node_it) { - nodes_to_check.push_back(&(*node_it)); + // Ignore those nodes after YieldOp. 
+ if (node_index_to_its_order_in_topological_sort_map.at(node_it->Index()) < yield_op_order_in_topological_sort) { + nodes_to_check.push_back(&(*node_it)); + } } + bool unexpected_failure = false; + bool found_softmax = false; + bool found_layernorm = false; + ptrdiff_t next_layernorm_execution_oder = -1; while (!nodes_to_check.empty()) { const Node* next_node = nodes_to_check.front(); nodes_to_check.pop_front(); @@ -53,16 +63,40 @@ void FindLayerBoundaryLayerNormNodes( visited_nodes.insert(next_node); if (softmax_ops.find(next_node->OpType()) != softmax_ops.end()) { - layer_boundary_ln_nodes.insert(&node); - break; + found_softmax = true; } else if (layernorm_ops.find(next_node->OpType()) != layernorm_ops.end()) { - break; + if (found_layernorm) { + // If we found another LayerNormalization node, we would report as warning, and do nothing for layer boundary detection. + unexpected_failure = true; + break; + } + found_layernorm = true; // don't trace further + next_layernorm_execution_oder = node_index_to_its_order_in_topological_sort_map.at(next_node->Index()); + continue; } else { for (auto node_it = next_node->OutputNodesBegin(); node_it != next_node->OutputNodesEnd(); ++node_it) { + // Stop if the node is after next Layernorm node in execution order. + if (found_layernorm && + node_index_to_its_order_in_topological_sort_map.at(node_it->Index()) >= next_layernorm_execution_oder) { + continue; + } nodes_to_check.push_back(&(*node_it)); } } } + + if (unexpected_failure) { + layer_boundary_ln_nodes.clear(); + break; + } + + if (found_softmax) { + layer_boundary_ln_nodes.insert(&node); + } else if (!found_layernorm) { + // If no Softmax found, and no other LayerNormalization found, this should be the last LayerNormalization node, + // we also consider it as boundary node. + layer_boundary_ln_nodes.insert(&node); + } } } diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/transformer_specific.h b/orttraining/orttraining/core/optimizer/memory_optimizer/transformer_specific.h index f2cfd640b0840..b58d822124f43 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/transformer_specific.h +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/transformer_specific.h @@ -20,6 +20,9 @@ namespace onnxruntime::optimizer::memory_optimizer { void FindLayerBoundaryLayerNormNodes(const GraphViewer& graph_viewer, const logging::Logger& logger, + const InlinedHashMap& + node_index_to_its_order_in_topological_sort_map, + const ptrdiff_t& yield_op_order_in_topological_sort, InlinedHashSet& layer_boundary_ln_nodes); } // namespace onnxruntime::optimizer::memory_optimizer From f9a92e589ad8588424725a91bbd0683a63bda950 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Wed, 6 Mar 2024 09:10:35 -0800 Subject: [PATCH 118/279] Upgrade the Windows SDK version that is used in WindowsAI Nuget Packaging pipeline (#19786) ### Description 1. Upgrade the version from 10.0.19041.0 to 10.0.22621.0. The old one misses some macros that are needed by PyTorch's CPUINFO 2. Also update cmake. ### Motivation and Context In PR #19655 I added CPUINFO to all Windows builds, but forgot to test this pipeline. 
--- .pipelines/windowsai-steps.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.pipelines/windowsai-steps.yml b/.pipelines/windowsai-steps.yml index ff5179e6135c2..855573de753b0 100644 --- a/.pipelines/windowsai-steps.yml +++ b/.pipelines/windowsai-steps.yml @@ -80,11 +80,11 @@ jobs: # must call vsdevcmd first to add cmake to PATH - script: | - curl -O -L https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-windows-x86_64.zip - 7z x cmake-3.26.3-windows-x86_64.zip + curl -O -L https://github.com/Kitware/CMake/releases/download/v3.28.3/cmake-3.28.3-windows-x86_64.zip + 7z x cmake-3.28.3-windows-x86_64.zip set PYTHONHOME=$(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools set PYTHONPATH=$(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools - $(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools\python.exe "$(Build.SourcesDirectory)\tools\ci_build\build.py" --build_dir $(Build.BinariesDirectory) --parallel --use_binskim_compliant_compile_flags --build_shared_lib --enable_onnx_tests --ms_experimental --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" --update --config RelWithDebInfo --enable_lto --use_telemetry --disable_rtti --enable_wcos $(BuildFlags) --cmake_extra_defines "CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" "CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" CMAKE_SYSTEM_VERSION=10.0.19041.0 --cmake_path $(Build.BinariesDirectory)\cmake-3.26.3-windows-x86_64\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake-3.26.3-windows-x86_64\bin\ctest.exe + $(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools\python.exe "$(Build.SourcesDirectory)\tools\ci_build\build.py" --build_dir $(Build.BinariesDirectory) --parallel --use_binskim_compliant_compile_flags --build_shared_lib --enable_onnx_tests --ms_experimental --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" --update --config RelWithDebInfo --enable_lto --use_telemetry --disable_rtti --enable_wcos --windows_sdk_version "10.0.22621.0" $(BuildFlags) --cmake_extra_defines "CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" "CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" --cmake_path $(Build.BinariesDirectory)\cmake-3.28.3-windows-x86_64\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake-3.28.3-windows-x86_64\bin\ctest.exe workingDirectory: '$(Build.BinariesDirectory)' displayName: 'Generate cmake config' From db8d0c8e06fd030da6b7bf00cf3fb20661dd13b8 Mon Sep 17 00:00:00 2001 From: Hector Li Date: Wed, 6 Mar 2024 11:21:19 -0800 Subject: [PATCH 119/279] reset dcvsEnable for different HTP performance mode (#19728) reset dcvsEnable for different HTP performance mode --- .../qnn/builder/qnn_backend_manager.cc | 80 ++++++++++--------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index e354bf6562722..6bb57b6a3e56c 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -678,13 +678,13 @@ Status QnnBackendManager::SetHtpPowerConfig(uint32_t htp_power_config_client_id, dcvs_v3.setSleepDisable = 0; dcvs_v3.sleepDisable = 0; dcvs_v3.setDcvsEnable = 1; - dcvs_v3.dcvsEnable = kDcvsDisable; dcvs_v3.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; // choose performance mode switch (htp_performance_mode) { case 
HtpPerformanceMode::kHtpBurst: dcvs_v3.setSleepLatency = 1; // true dcvs_v3.sleepLatency = kSleepMinLatency; + dcvs_v3.dcvsEnable = kDcvsDisable; dcvs_v3.setBusParams = 1; dcvs_v3.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; dcvs_v3.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; @@ -698,6 +698,7 @@ Status QnnBackendManager::SetHtpPowerConfig(uint32_t htp_power_config_client_id, case HtpPerformanceMode::kHtpHighPerformance: dcvs_v3.setSleepLatency = 1; // true dcvs_v3.sleepLatency = kSleepLowLatency; + dcvs_v3.dcvsEnable = kDcvsDisable; dcvs_v3.setBusParams = 1; dcvs_v3.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_TURBO; dcvs_v3.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_TURBO; @@ -707,33 +708,36 @@ Status QnnBackendManager::SetHtpPowerConfig(uint32_t htp_power_config_client_id, dcvs_v3.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_TURBO; dcvs_v3.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_TURBO; break; - case HtpPerformanceMode::kHtpPowerSaver: + case HtpPerformanceMode::kHtpBalanced: dcvs_v3.setSleepLatency = 1; // true dcvs_v3.sleepLatency = kSleepMediumLatency; + dcvs_v3.dcvsEnable = kDcvsEnable; dcvs_v3.setBusParams = 1; - dcvs_v3.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_SVS; - dcvs_v3.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_SVS; - dcvs_v3.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_SVS; + dcvs_v3.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_NOM_PLUS; + dcvs_v3.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_NOM_PLUS; + dcvs_v3.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_NOM_PLUS; dcvs_v3.setCoreParams = 1; - dcvs_v3.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_SVS; - dcvs_v3.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_SVS; - dcvs_v3.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_SVS; + dcvs_v3.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_NOM_PLUS; + dcvs_v3.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_NOM_PLUS; + dcvs_v3.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_NOM_PLUS; break; - case HtpPerformanceMode::kHtpLowPowerSaver: + case HtpPerformanceMode::kHtpLowBalanced: dcvs_v3.setSleepLatency = 1; // true dcvs_v3.sleepLatency = kSleepMediumLatency; + dcvs_v3.dcvsEnable = kDcvsEnable; dcvs_v3.setBusParams = 1; - dcvs_v3.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_SVS2; - dcvs_v3.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_SVS2; - dcvs_v3.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_SVS2; + dcvs_v3.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_NOM; + dcvs_v3.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_NOM; + dcvs_v3.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_NOM; dcvs_v3.setCoreParams = 1; - dcvs_v3.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_SVS2; - dcvs_v3.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_SVS2; - dcvs_v3.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_SVS2; + dcvs_v3.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_NOM; + dcvs_v3.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_NOM; + dcvs_v3.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_NOM; break; case HtpPerformanceMode::kHtpHighPowerSaver: dcvs_v3.setSleepLatency = 1; // true dcvs_v3.sleepLatency = kSleepMediumLatency; + dcvs_v3.dcvsEnable = kDcvsEnable; dcvs_v3.setBusParams = 1; dcvs_v3.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_SVS_PLUS; dcvs_v3.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_SVS_PLUS; @@ -743,41 +747,45 @@ Status QnnBackendManager::SetHtpPowerConfig(uint32_t htp_power_config_client_id, dcvs_v3.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_SVS_PLUS; dcvs_v3.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_SVS_PLUS; break; - case HtpPerformanceMode::kHtpExtremePowerSaver: + case 
HtpPerformanceMode::kHtpPowerSaver: dcvs_v3.setSleepLatency = 1; // true dcvs_v3.sleepLatency = kSleepMediumLatency; + dcvs_v3.dcvsEnable = kDcvsEnable; dcvs_v3.setBusParams = 1; - dcvs_v3.busVoltageCornerMin = DCVS_VOLTAGE_CORNER_DISABLE; - dcvs_v3.busVoltageCornerTarget = DCVS_VOLTAGE_CORNER_DISABLE; - dcvs_v3.busVoltageCornerMax = DCVS_VOLTAGE_CORNER_DISABLE; + dcvs_v3.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_SVS; + dcvs_v3.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_SVS; + dcvs_v3.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_SVS; dcvs_v3.setCoreParams = 1; - dcvs_v3.coreVoltageCornerMin = DCVS_VOLTAGE_CORNER_DISABLE; - dcvs_v3.coreVoltageCornerTarget = DCVS_VOLTAGE_CORNER_DISABLE; - dcvs_v3.coreVoltageCornerMax = DCVS_VOLTAGE_CORNER_DISABLE; + dcvs_v3.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_SVS; + dcvs_v3.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_SVS; + dcvs_v3.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_SVS; break; - case HtpPerformanceMode::kHtpLowBalanced: + case HtpPerformanceMode::kHtpLowPowerSaver: dcvs_v3.setSleepLatency = 1; // true dcvs_v3.sleepLatency = kSleepMediumLatency; + dcvs_v3.dcvsEnable = kDcvsEnable; dcvs_v3.setBusParams = 1; - dcvs_v3.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_NOM; - dcvs_v3.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_NOM; - dcvs_v3.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_NOM; + dcvs_v3.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_SVS2; + dcvs_v3.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_SVS2; + dcvs_v3.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_SVS2; dcvs_v3.setCoreParams = 1; - dcvs_v3.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_NOM; - dcvs_v3.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_NOM; - dcvs_v3.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_NOM; + dcvs_v3.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_SVS2; + dcvs_v3.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_SVS2; + dcvs_v3.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_SVS2; break; - case HtpPerformanceMode::kHtpBalanced: + case HtpPerformanceMode::kHtpExtremePowerSaver: + dcvs_v3.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER_MODE; dcvs_v3.setSleepLatency = 1; // true dcvs_v3.sleepLatency = kSleepMediumLatency; + dcvs_v3.dcvsEnable = kDcvsEnable; dcvs_v3.setBusParams = 1; - dcvs_v3.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_NOM_PLUS; - dcvs_v3.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_NOM_PLUS; - dcvs_v3.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_NOM_PLUS; + dcvs_v3.busVoltageCornerMin = DCVS_VOLTAGE_CORNER_DISABLE; + dcvs_v3.busVoltageCornerTarget = DCVS_VOLTAGE_CORNER_DISABLE; + dcvs_v3.busVoltageCornerMax = DCVS_VOLTAGE_CORNER_DISABLE; dcvs_v3.setCoreParams = 1; - dcvs_v3.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_NOM_PLUS; - dcvs_v3.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_NOM_PLUS; - dcvs_v3.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_NOM_PLUS; + dcvs_v3.coreVoltageCornerMin = DCVS_VOLTAGE_CORNER_DISABLE; + dcvs_v3.coreVoltageCornerTarget = DCVS_VOLTAGE_CORNER_DISABLE; + dcvs_v3.coreVoltageCornerMax = DCVS_VOLTAGE_CORNER_DISABLE; break; default: ORT_THROW("Invalid performance profile %d", static_cast(htp_performance_mode)); From 8bd1335d00375179fa9cdccf1c6fbda8c04304df Mon Sep 17 00:00:00 2001 From: aciddelgado <139922440+aciddelgado@users.noreply.github.com> Date: Wed, 6 Mar 2024 12:34:33 -0800 Subject: [PATCH 120/279] Fix GQA Rotary Embedding sequence length (#19801) ### Description Previously, GQA incorrectly enforced rotary cos and sin cache to be of sequence length equal to present sequence length. 
Now it enforces that it be greater than or equal to the present sequence length, since, to match the Rotary Embedding Op, it should be of max_sequence_length.

### Motivation and Context
Fixes an issue with fusing Rotary Embedding and GQA for certain models which prefer this optimization.
---
 .../contrib_ops/cuda/bert/group_query_attention_helper.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_helper.h b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_helper.h
index 853e1a710cb24..6fa11200fd5be 100644
--- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_helper.h
+++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_helper.h
@@ -214,13 +214,13 @@ Status CheckInputs(const Tensor* query,
                            "head_size shall be a multiple of 16. Got head_size % 16 == ",
                            head_size % 16);
   }
-  if (cos_dims[0] != present_sequence_length) {
+  if (cos_dims[0] < present_sequence_length) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                           "cos_cache dimension 0 must be of present_sequence_length.");
+                           "cos_cache dimension 0 should be of max_sequence_length.");
   }
-  if (sin_dims[0] != present_sequence_length) {
+  if (sin_dims[0] < present_sequence_length) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                           "sin_cache dimension 0 must be of present_sequence_length.");
+                           "sin_cache dimension 0 should be of max_sequence_length.");
   }
   if (cos_dims[1] != (head_size / 16) * 8) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,

From f2dc725b3355ec25e61d6970b6c030c68f9d3ac4 Mon Sep 17 00:00:00 2001
From: Markus Tavenrath
Date: Wed, 6 Mar 2024 21:35:55 +0100
Subject: [PATCH 121/279] Add SpaceToDepth and DepthToSpace CUDA NHWC Ops (#19646)

### Description
- Adding CUDA NHWC support for SpaceToDepth and DepthToSpace
- Add a new test which verifies that the SpaceToDepth swizzling for the H axis is correct.
- If CUDA NHWC is enabled, run all tests on the CUDA EP with NHWC as well.

### Motivation and Context
Adding more NHWC operations to avoid layout transformations when using the CUDA EP for more efficiency.
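For reference, the NHWC SpaceToDepth mapping added here is equivalent to viewing the input as `{N, H/b, b, W/b, b, C}`, permuting to `{N, H/b, W/b, b, b, C}`, and flattening the last three axes into the output channel. A minimal CPU reference sketch of that indexing (assuming `H` and `W` are divisible by the block size `b`; the function is illustrative only, not the kernel code):

```cpp
#include <cstdint>
#include <vector>

// Reference NHWC SpaceToDepth: [N, H, W, C] -> [N, H/b, W/b, C*b*b].
// Mirrors the 6-D reshape + {0,1,3,2,4,5} permutation used by the NHWC kernel path.
std::vector<float> SpaceToDepthNHWC(const std::vector<float>& x,
                                    int64_t N, int64_t H, int64_t W, int64_t C, int64_t b) {
  const int64_t Ho = H / b, Wo = W / b, Co = C * b * b;
  std::vector<float> y(static_cast<size_t>(N * Ho * Wo * Co));
  for (int64_t n = 0; n < N; ++n)
    for (int64_t h = 0; h < H; ++h)
      for (int64_t w = 0; w < W; ++w)
        for (int64_t c = 0; c < C; ++c) {
          const int64_t co = ((h % b) * b + (w % b)) * C + c;             // packed block offset
          const int64_t src = ((n * H + h) * W + w) * C + c;              // NHWC input index
          const int64_t dst = ((n * Ho + h / b) * Wo + w / b) * Co + co;  // NHWC output index
          y[static_cast<size_t>(dst)] = x[static_cast<size_t>(src)];
        }
  return y;
}
```

The CUDA implementation below expresses the same mapping as a 6-D transpose over virtual input/output shapes rather than an explicit loop.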
--- include/onnxruntime/core/graph/constants.h | 1 + .../contrib_ops/internal_nhwc_onnx_schemas.cc | 1 + .../layout_transformation.cc | 3 +- .../providers/cpu/tensor/space_depth_ops.h | 16 +- .../core/providers/cuda/cuda_nhwc_kernels.cc | 16 ++ .../providers/cuda/tensor/space_depth_ops.cc | 196 +++++++++++++----- .../providers/cuda/tensor/space_depth_ops.h | 2 + .../test/contrib_ops/gridsample_test.cc | 17 +- onnxruntime/test/providers/base_tester.cc | 7 + .../providers/cpu/generator/random_test.cc | 12 +- .../providers/cpu/nn/batch_norm_op_test.cc | 6 +- .../test/providers/cpu/nn/conv_op_test.cc | 2 + .../cpu/nn/conv_transpose_op_test.cc | 15 +- .../test/providers/cpu/nn/pool_op_test.cc | 86 ++++---- .../cpu/reduction/reduction_ops_test.cc | 3 + .../test/providers/cpu/rnn/rnn_op_test.cc | 7 +- .../cpu/tensor/gather_elements_op_test.cc | 2 +- .../providers/cpu/tensor/resize_op_test.cc | 22 +- .../providers/cpu/tensor/scatter_op_test.cc | 7 +- .../cpu/tensor/space_depth_ops_test.cc | 47 +++++ .../providers/cpu/tensor/upsample_op_test.cc | 6 +- 21 files changed, 345 insertions(+), 129 deletions(-) diff --git a/include/onnxruntime/core/graph/constants.h b/include/onnxruntime/core/graph/constants.h index 9b26ba914c7dd..8e04050d089a0 100644 --- a/include/onnxruntime/core/graph/constants.h +++ b/include/onnxruntime/core/graph/constants.h @@ -31,6 +31,7 @@ constexpr size_t kMaxExecutionProviderNameLen = 30; constexpr const char* kCpuExecutionProvider = "CPUExecutionProvider"; constexpr const char* kCudaExecutionProvider = "CUDAExecutionProvider"; +constexpr const char* kCudaNHWCExecutionProvider = "CUDANHWCExecutionProvider"; constexpr const char* kDnnlExecutionProvider = "DnnlExecutionProvider"; constexpr const char* kOpenVINOExecutionProvider = "OpenVINOExecutionProvider"; constexpr const char* kVitisAIExecutionProvider = "VitisAIExecutionProvider"; diff --git a/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_schemas.cc b/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_schemas.cc index c8960578f9e3d..6bf19654a3ce9 100644 --- a/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_schemas.cc +++ b/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_schemas.cc @@ -106,6 +106,7 @@ void OpSet_Internal_NHWC_ONNX::ForEachSchema(const std::function& GetCUDALayoutSensitiveOps() { "GlobalAveragePool", "AveragePool", "GridSample", - }; + "DepthToSpace", + "SpaceToDepth"}; }(); return cuda_nhwc_ops; } diff --git a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h index 7d117317ba172..3218c8952d6ec 100644 --- a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h +++ b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h @@ -14,6 +14,7 @@ class SpaceDepthBase { "Attribute blocksize is not set."); } + template Status InputValidationsAndOutputDimsCalc(const Tensor& input, int64_t& batch, int64_t& input_depth, int64_t& input_height, int64_t& input_width, @@ -27,9 +28,15 @@ class SpaceDepthBase { } batch = input_shape[0]; - input_depth = input_shape[1]; - input_height = input_shape[2]; - input_width = input_shape[3]; + if constexpr (IsNHWC) { + input_depth = input_shape[3]; + input_height = input_shape[1]; + input_width = input_shape[2]; + } else { + input_depth = input_shape[1]; + input_height = input_shape[2]; + input_width = input_shape[3]; + } if (is_space_to_depth) { // SpaceToDepth op if ((input_height % this->blocksize_) != 0) { @@ -46,7 +53,8 @@ class SpaceDepthBase { } else { // DepthToSpace op if ((input_depth % 
(blocksize_ * blocksize_) != 0)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "DepthToSpace requires input depth to be a multiple of (block_size * blok_size)"); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "DepthToSpace requires input depth to be a multiple of (block_size * block_size)"); } output_depth = input_depth / blocksize_ / blocksize_; diff --git a/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc index 64edc319e15ac..da7802fe8d5dc 100644 --- a/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc +++ b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc @@ -86,6 +86,11 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalN BatchNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 15, MLFloat16, BatchNormalization); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, DepthToSpace); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, 12, DepthToSpace); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 13, DepthToSpace); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 12, SpaceToDepth); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 13, SpaceToDepth); Status RegisterCudaNhwcKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn nhwc_function_table[] = { @@ -171,6 +176,17 @@ Status RegisterCudaNhwcKernels(KernelRegistry& kernel_registry) { kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, float, ConvTranspose)>, BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, }; for (auto& function_table_entry : nhwc_function_table) { diff --git a/onnxruntime/core/providers/cuda/tensor/space_depth_ops.cc b/onnxruntime/core/providers/cuda/tensor/space_depth_ops.cc index 407a2ef3981f1..aaaf3600b676e 100644 --- a/onnxruntime/core/providers/cuda/tensor/space_depth_ops.cc +++ b/onnxruntime/core/providers/cuda/tensor/space_depth_ops.cc @@ -20,7 +20,22 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( {DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}), - SpaceToDepth); + SpaceToDepth); + +#ifdef ENABLE_CUDA_NHWC_OPS +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + SpaceToDepth, + kMSInternalNHWCDomain, + 1, + 12, + kCudaExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", + {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + SpaceToDepth); +#endif ONNX_OPERATOR_KERNEL_EX( SpaceToDepth, @@ -32,7 +47,21 @@ ONNX_OPERATOR_KERNEL_EX( {DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}), - SpaceToDepth); + SpaceToDepth); + +#ifdef ENABLE_CUDA_NHWC_OPS +ONNX_OPERATOR_KERNEL_EX( + SpaceToDepth, + kMSInternalNHWCDomain, + 13, + kCudaExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", + {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + SpaceToDepth); +#endif ONNX_OPERATOR_VERSIONED_KERNEL_EX( DepthToSpace, @@ -45,7 +74,22 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( {DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}), - DepthToSpace); + DepthToSpace); + +#ifdef 
ENABLE_CUDA_NHWC_OPS +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + DepthToSpace, + kMSInternalNHWCDomain, + 1, + 10, + kCudaExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", + {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + DepthToSpace); +#endif ONNX_OPERATOR_VERSIONED_KERNEL_EX( DepthToSpace, @@ -58,7 +102,22 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( {DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}), - DepthToSpace); + DepthToSpace); + +#ifdef ENABLE_CUDA_NHWC_OPS +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + DepthToSpace, + kMSInternalNHWCDomain, + 11, + 12, + kCudaExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", + {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + DepthToSpace); +#endif ONNX_OPERATOR_KERNEL_EX( DepthToSpace, @@ -70,23 +129,35 @@ ONNX_OPERATOR_KERNEL_EX( {DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}), - DepthToSpace); + DepthToSpace); + +#ifdef ENABLE_CUDA_NHWC_OPS +ONNX_OPERATOR_KERNEL_EX( + DepthToSpace, + kMSInternalNHWCDomain, + 13, + kCudaExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", + {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + DepthToSpace); +#endif static Status SpaceDepthOpCudaImpl(const cudaDeviceProp& prop, cudaStream_t stream, const cublasHandle_t cublas_handle, const Tensor& input, Tensor& output, const std::vector& permutation, - const int64_t batch_size, - const int64_t in_dim1, const int64_t in_dim2, const int64_t in_dim3, - const int64_t in_dim4, const int64_t in_dim5, + const TensorShape& virtual_input_shape, const TensorShape& virtual_output_shape) { - TensorShape virtual_input_shape{batch_size, in_dim1, in_dim2, in_dim3, in_dim4, in_dim5}; return Transpose::DoTranspose(prop, stream, cublas_handle, permutation, input, output, &virtual_input_shape, &virtual_output_shape); } -Status SpaceToDepth::ComputeInternal(OpKernelContext* context) const { +template +Status SpaceToDepth::ComputeInternal(OpKernelContext* context) const { const auto* tensor_pointer = context->Input(0); if (tensor_pointer == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch"); const Tensor& input = *tensor_pointer; @@ -101,29 +172,44 @@ Status SpaceToDepth::ComputeInternal(OpKernelContext* context) const { int64_t output_height = -1; int64_t output_width = -1; - ORT_RETURN_IF_ERROR(InputValidationsAndOutputDimsCalc(input, - batch, - input_depth, input_height, input_width, - output_depth, output_height, output_width, - true)); + ORT_RETURN_IF_ERROR( + InputValidationsAndOutputDimsCalc(input, + batch, + input_depth, input_height, input_width, + output_depth, output_height, output_width, + true)); // We use the "actual" output shape to construct the output tensor - Tensor& output = *context->Output(0, {batch, output_depth, output_height, output_width}); + Tensor& output = (Layout == LAYOUT_NCHW) + ? *context->Output(0, {batch, output_depth, output_height, output_width}) + : *context->Output(0, {batch, output_height, output_width, output_depth}); + + TensorShape virtual_input_shape = (Layout == LAYOUT_NCHW) + ? 
TensorShape{batch, input_depth, input_height / blocksize_, + blocksize_, input_width / blocksize_, blocksize_} + : TensorShape{batch, input_height / blocksize_, blocksize_, + input_width / blocksize_, blocksize_, input_depth}; // We will pass in the "virtual" output shape to be used by DoTranspose() in SpaceDepthOpCudaImpl(...) - TensorShape virtual_output_shape{batch, blocksize_, blocksize_, input_depth, - input_height / blocksize_, input_width / blocksize_}; + TensorShape virtual_output_shape = (Layout == LAYOUT_NCHW) + ? TensorShape{batch, blocksize_, blocksize_, input_depth, + input_height / blocksize_, input_width / blocksize_} + : TensorShape{batch, input_height / blocksize_, input_width / blocksize_, + blocksize_, blocksize_, input_depth}; - std::vector permutation = {0, 3, 5, 1, 2, 4}; + std::vector permutation = (Layout == LAYOUT_NCHW) + ? std::vector{0, 3, 5, 1, 2, 4} + : std::vector{0, 1, 3, 2, 4, 5}; - ORT_RETURN_IF_ERROR(SpaceDepthOpCudaImpl(GetDeviceProp(), Stream(context), GetCublasHandle(context), input, output, permutation, batch, - input_depth, input_height / blocksize_, blocksize_, input_width / blocksize_, blocksize_, - virtual_output_shape)); + ORT_RETURN_IF_ERROR( + SpaceDepthOpCudaImpl(GetDeviceProp(), Stream(context), GetCublasHandle(context), input, output, permutation, + virtual_input_shape, virtual_output_shape)); return Status::OK(); } -Status DepthToSpace::ComputeInternal(OpKernelContext* context) const { +template +Status DepthToSpace::ComputeInternal(OpKernelContext* context) const { const auto* tensor_pointer = context->Input(0); if (tensor_pointer == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch"); const Tensor& input = *tensor_pointer; @@ -138,46 +224,56 @@ Status DepthToSpace::ComputeInternal(OpKernelContext* context) const { int64_t output_height = -1; int64_t output_width = -1; - ORT_RETURN_IF_ERROR(InputValidationsAndOutputDimsCalc(input, - batch, - input_depth, input_height, input_width, - output_depth, output_height, output_width, - false)); + ORT_RETURN_IF_ERROR( + InputValidationsAndOutputDimsCalc(input, + batch, + input_depth, input_height, input_width, + output_depth, output_height, output_width, + false)); // We use the "actual" output shape to construct the output tensor - Tensor& output = *context->Output(0, {batch, output_depth, output_height, output_width}); + Tensor& output = (Layout == LAYOUT_NCHW) + ? *context->Output(0, {batch, output_depth, output_height, output_width}) + : *context->Output(0, {batch, output_height, output_width, output_depth}); + + int64_t virtual_input_depth = input_depth / blocksize_ / blocksize_; + TensorShape virtual_input_shape; + + // cdr only here! + if (is_dcr_) { + virtual_input_shape = (Layout == LAYOUT_NCHW) + ? TensorShape{batch, blocksize_, blocksize_, + virtual_input_depth, input_height, input_width} + : TensorShape{batch, input_height, input_width, + blocksize_, blocksize_, virtual_input_depth}; + } else { + virtual_input_shape = (Layout == LAYOUT_NCHW) + ? TensorShape{batch, virtual_input_depth, blocksize_, + blocksize_, input_height, input_width} + : TensorShape{batch, input_height, input_width, + virtual_input_depth, blocksize_, blocksize_}; + } // We will pass in the "virtual" output shape to be used by DoTranspose() in SpaceDepthOpCudaImpl(...) - TensorShape virtual_output_shape{batch, input_depth / blocksize_ / blocksize_, - input_height, blocksize_, input_width, blocksize_}; + TensorShape virtual_output_shape = (Layout == LAYOUT_NCHW) + ? 
TensorShape{batch, virtual_input_depth, input_height, + blocksize_, input_width, blocksize_} + : TensorShape{batch, input_height, blocksize_, + input_width, blocksize_, virtual_input_depth}; std::vector permutation; - permutation.reserve(6); - permutation.push_back(0); if (is_dcr_) { - permutation.push_back(3); - permutation.push_back(4); - permutation.push_back(1); - permutation.push_back(5); - permutation.push_back(2); + permutation = (Layout == LAYOUT_NCHW) + ? std::vector({0, 3, 4, 1, 5, 2}) + : std::vector({0, 1, 3, 2, 4, 5}); } else { - permutation.push_back(1); - permutation.push_back(4); - permutation.push_back(2); - permutation.push_back(5); - permutation.push_back(3); + permutation = std::vector({0, 1, 4, 2, 5, 3}); } - int64_t dim1 = is_dcr_ ? blocksize_ : input_depth / blocksize_ / blocksize_; - int64_t dim3 = is_dcr_ ? input_depth / blocksize_ / blocksize_ : blocksize_; - ORT_RETURN_IF_ERROR(SpaceDepthOpCudaImpl(GetDeviceProp(), Stream(context), GetCublasHandle(context), input, output, - permutation, - batch, - dim1, blocksize_, dim3, input_height, input_width, - virtual_output_shape)); + permutation, virtual_input_shape, virtual_output_shape)); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/tensor/space_depth_ops.h b/onnxruntime/core/providers/cuda/tensor/space_depth_ops.h index 57b85556f1dbe..8780d9b365005 100644 --- a/onnxruntime/core/providers/cuda/tensor/space_depth_ops.h +++ b/onnxruntime/core/providers/cuda/tensor/space_depth_ops.h @@ -9,6 +9,7 @@ namespace onnxruntime { namespace cuda { +template class SpaceToDepth final : public CudaKernel, SpaceDepthBase { public: explicit SpaceToDepth(const OpKernelInfo& info) : CudaKernel(info), SpaceDepthBase(info) { @@ -17,6 +18,7 @@ class SpaceToDepth final : public CudaKernel, SpaceDepthBase { Status ComputeInternal(OpKernelContext* context) const override; }; +template class DepthToSpace final : public CudaKernel, SpaceDepthBase { public: explicit DepthToSpace(const OpKernelInfo& info) : CudaKernel(info), SpaceDepthBase(info) { diff --git a/onnxruntime/test/contrib_ops/gridsample_test.cc b/onnxruntime/test/contrib_ops/gridsample_test.cc index 1f31c2bd21f14..46ed04301a9e8 100644 --- a/onnxruntime/test/contrib_ops/gridsample_test.cc +++ b/onnxruntime/test/contrib_ops/gridsample_test.cc @@ -32,7 +32,7 @@ TEST(GridsampleContribOpTest, gridsample_default) { 3.8000f, 7.9000f, 8.7000f, 9.5000f, 10.3000f, 5.3000f, 5.4000f, 11.1000f, 11.9000f, 12.7000f, 13.5000f, 6.9000f, 3.0000f, 6.1500f, 6.5500f, 6.9500f, 7.3500f, 3.7500f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); } TEST(GridsampleContribOpTest, gridsample_paddingmode_zeros) { @@ -45,7 +45,7 @@ TEST(GridsampleContribOpTest, gridsample_paddingmode_zeros) { 5.0000f, 5.0000f, 10.0000f, 10.0000f}); test.AddAttribute("padding_mode", "zeros"); test.AddOutput("Y", {1, 1, 2, 4}, {0.0000f, 0.0000f, 1.7000f, 0.0000f, 0.0000f, 1.7000f, 0.0000f, 0.0000f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); } TEST(GridsampleContribOpTest, gridsample_paddingmode_border) { @@ -58,7 +58,7 @@ TEST(GridsampleContribOpTest, gridsample_paddingmode_border) { 5.0000f, 5.0000f, 10.0000f, 10.0000f}); test.AddAttribute("padding_mode", "border"); test.AddOutput("Y", {1, 1, 2, 4}, {0.0000f, 0.0000f, 1.7000f, 5.0000f, 5.0000f, 1.7000f, 5.0000f, 5.0000f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); } 
TEST(GridsampleContribOpTest, gridsample_paddingmode_reflection) { @@ -71,7 +71,8 @@ TEST(GridsampleContribOpTest, gridsample_paddingmode_reflection) { 5.0000f, 5.0000f, 10.0000f, 10.0000f}); test.AddAttribute("padding_mode", "reflection"); test.AddOutput("Y", {1, 1, 2, 4}, {2.5000f, 0.0000f, 1.7000f, 2.5000f, 2.5000f, 1.7000f, 5.0000f, 2.5000f}); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider}); // Accuracy issue for QNN + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaNHWCExecutionProvider, kQnnExecutionProvider}); // Accuracy issue for QNN } TEST(GridsampleContribOpTest, gridsample_aligncorners_true) { @@ -86,7 +87,7 @@ TEST(GridsampleContribOpTest, gridsample_aligncorners_true) { test.AddAttribute("mode", "bilinear"); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", {1, 1, 2, 4}, {0.0000f, 1.2500f, 2.0000f, 2.5000f, 2.5000f, 2.0000f, 3.7500f, 5.0000f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); } TEST(GridsampleContribOpTest, gridsample_mode_bilinear) { @@ -99,7 +100,7 @@ TEST(GridsampleContribOpTest, gridsample_mode_bilinear) { 0.5000f, 0.5000f, 1.0000f, 1.0000f}); test.AddAttribute("mode", "bilinear"); test.AddOutput("Y", {1, 1, 2, 4}, {0.0000f, 0.5000f, 1.7000f, 2.5000f, 2.5000f, 1.7000f, 4.5000f, 1.2500f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); } TEST(GridsampleContribOpTest, gridsample_mode_nearest) { @@ -112,7 +113,7 @@ TEST(GridsampleContribOpTest, gridsample_mode_nearest) { 0.5000f, 0.5000f, 1.0000f, 1.0000f}); test.AddAttribute("mode", "nearest"); test.AddOutput("Y", {1, 1, 2, 4}, {0.f, 0.f, 2.f, 2.f, 2.f, 2.f, 5.f, 0.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); } TEST(GridsampleContribOpTest, gridsample_mode_bicubic) { @@ -125,7 +126,7 @@ TEST(GridsampleContribOpTest, gridsample_mode_bicubic) { 0.5000f, 0.5000f, 1.0000f, 1.0000f}); test.AddAttribute("mode", "bicubic"); test.AddOutput("Y", {1, 1, 2, 4}, {-0.1406f, 0.3828f, 1.7556f, 2.9688f, 2.9688f, 1.7556f, 5.1445f, 1.3906f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); } } // namespace test diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc index 16cce85f7cb0a..84cb663a2984a 100644 --- a/onnxruntime/test/providers/base_tester.cc +++ b/onnxruntime/test/providers/base_tester.cc @@ -622,6 +622,9 @@ void BaseTester::RunWithConfig(size_t* number_of_pre_packed_weights_counter, static const std::string all_provider_types[] = { kCpuExecutionProvider, kCudaExecutionProvider, +#ifdef ENABLE_CUDA_NHWC_OPS + kCudaNHWCExecutionProvider, +#endif kDnnlExecutionProvider, kTensorrtExecutionProvider, kOpenVINOExecutionProvider, @@ -650,6 +653,10 @@ void BaseTester::RunWithConfig(size_t* number_of_pre_packed_weights_counter, execution_provider = DefaultCpuExecutionProvider(); else if (provider_type == onnxruntime::kCudaExecutionProvider) execution_provider = DefaultCudaExecutionProvider(); +#ifdef ENABLE_CUDA_NHWC_OPS + else if (provider_type == onnxruntime::kCudaNHWCExecutionProvider) + execution_provider = DefaultCudaNHWCExecutionProvider(); +#endif else if (provider_type == onnxruntime::kDnnlExecutionProvider) execution_provider = DefaultDnnlExecutionProvider(); else if (provider_type == onnxruntime::kOpenVINOExecutionProvider) diff --git a/onnxruntime/test/providers/cpu/generator/random_test.cc 
b/onnxruntime/test/providers/cpu/generator/random_test.cc index 532b98317405f..be049d1cf0ce3 100644 --- a/onnxruntime/test/providers/cpu/generator/random_test.cc +++ b/onnxruntime/test/providers/cpu/generator/random_test.cc @@ -36,7 +36,8 @@ TEST(Random, RandomNormal2DDouble) { // The expected_output is generated using std lib, which is used by CPU kernel only. // So we need to exclude other EPs here. Ditto for other places. - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider}); } void RunRandomNormalLike3DFloat(bool infer_dtype = false) { @@ -72,7 +73,8 @@ void RunRandomNormalLike3DFloat(bool infer_dtype = false) { test.AddOutput("Y", dims, expected_output); // TensorRT does not support manual seed overrides and there will be result mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider, kTensorrtExecutionProvider}); } TEST(Random, RandomNormalLike3DDouble) { @@ -109,7 +111,8 @@ TEST(Random, RandomUniform1DFloat) { test.AddOutput("Y", dims, expected_output); // TensorRT does not support manual seed overrides and there will be result mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider, kTensorrtExecutionProvider}); } void RunRandomUniformLikeTest(bool infer_dtype = false) { @@ -142,7 +145,8 @@ void RunRandomUniformLikeTest(bool infer_dtype = false) { test.AddOutput("Y", dims, expected_output); // TensorRT does not support seed parameter and there will be result mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider, kTensorrtExecutionProvider}); } TEST(Random, RandomUniformLike2DDouble) { diff --git a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc index 54e5c71bd753a..3d30fc62a945d 100644 --- a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc @@ -917,7 +917,7 @@ TEST(BatchNormTest, ForwardTrainingTestWithSavedOutputsOpset9) { // exclude TRT and OpenVINO for same reasons as seen in TestBatchNorm() test.Run(OpTester::ExpectResult::kExpectSuccess, "", // TODO(mtavenrath) flakiness of running_mean for CUDA has been fixed, the delta of running_var is still ~0.1 - {kCudaExecutionProvider, kRocmExecutionProvider, + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider, kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider}); } @@ -945,7 +945,7 @@ TEST(BatchNormTest, ForwardTrainingTestOpset14) { // exclude CUDA Execution Provider due to flakiness // exclude TRT and OpenVINO for same reasons as seen in TestBatchNorm() test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaExecutionProvider, kRocmExecutionProvider, + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, 
kRocmExecutionProvider, kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider}); } @@ -972,7 +972,7 @@ TEST(BatchNormTest, ForwardTrainingTestOpset15) { // Same exclusions as the opset 14 test test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaExecutionProvider, kRocmExecutionProvider, + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider, kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider}); } #endif // BATCHNORM_INCLUDE_TRAINING_SUPPORT diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc index dede278b7274f..0efa78af2795c 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc @@ -59,6 +59,8 @@ void TestConvOp(const ConvOpAndTestAttributes& attributes, std::unordered_set excluded_providers(attributes.excluded_providers); // Disable TensorRT because weight as input is not supported excluded_providers.insert(kTensorrtExecutionProvider); + // Disable CUDA NHWC execution provider as it is currently flaky + excluded_providers.insert(kCudaNHWCExecutionProvider); // QNN SDK 2.10.0 has a bug that breaks support for dynamic bias inputs. excluded_providers.insert(kQnnExecutionProvider); diff --git a/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc index 472f841aa8565..ec93dc249eeb2 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc @@ -75,7 +75,8 @@ void TestConvTransposeOp(const ConvTransposeOpAttributes& attributes, const vector& expected_output_shape, OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess, const std::string& err_str = "", - const std::unordered_set& excluded_provider_types = {kTensorrtExecutionProvider, kQnnExecutionProvider}) { + const std::unordered_set& excluded_provider_types = + {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kQnnExecutionProvider}) { std::unordered_set extra_exclude_openvino_for_initializer_filter = excluded_provider_types; extra_exclude_openvino_for_initializer_filter.insert(kOpenVINOExecutionProvider); TestConvTransposeOpInitializer(attributes, inputs, input_shapes, expected_output, expected_output_shape, @@ -409,7 +410,8 @@ TEST(ConvTransposeTest, ConvTranspose_2D_OutputShape_2) { vector Y_shape = {1, 1, 1, 14}; auto expected_vals = {1.0f, 2.0f, 5.0f, 11.0f, 19.0f, 28.0f, 37.0f, 46.0f, 55.0f, 64.0f, 63.0f, 51.0f, 27.0f, 10.0f}; TestConvTransposeOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, - OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kQnnExecutionProvider}); + OpTester::ExpectResult::kExpectSuccess, "", + {kOpenVINOExecutionProvider, kCudaNHWCExecutionProvider, kQnnExecutionProvider}); } TEST(ConvTransposeTest, ConvTranspose_2D_OutputShapeWithBatchSize) { @@ -434,7 +436,8 @@ TEST(ConvTransposeTest, ConvTranspose_2D_OutputShapeWithBatchSize) { auto expected_vals = {1.0f, 2.0f, 5.0f, 11.0f, 19.0f, 28.0f, 37.0f, 46.0f, 55.0f, 64.0f, 63.0f, 51.0f, 27.0f, 10.0f, 11.0f, 32.0f, 65.0f, 91.0f, 109.0f, 118.0f, 127.0f, 136.0f, 145.0f, 154.0f, 143.0f, 111.0f, 57.0f, 20.0f}; TestConvTransposeOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, - OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kQnnExecutionProvider}); + OpTester::ExpectResult::kExpectSuccess, "", + 
{kOpenVINOExecutionProvider, kCudaNHWCExecutionProvider, kQnnExecutionProvider}); } TEST(ConvTransposeTest, ConvTranspose_InvalidKernelShape) { @@ -871,7 +874,8 @@ TEST(ConvTransposeTest, DimWithZero) { TestConvTransposeOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, OpTester::ExpectResult::kExpectSuccess, "", - {kTensorrtExecutionProvider, kAclExecutionProvider, kQnnExecutionProvider}); + {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, + kAclExecutionProvider, kQnnExecutionProvider}); } TEST(ConvTransposeTest, ConvTranspose_3D) { @@ -1005,7 +1009,8 @@ TEST(ConvTransposeTest, ConvTranspose_3D) { TestConvTransposeOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, OpTester::ExpectResult::kExpectSuccess, "", - {kTensorrtExecutionProvider, kCudaExecutionProvider, kQnnExecutionProvider}); + {kTensorrtExecutionProvider, kCudaExecutionProvider, + kCudaNHWCExecutionProvider, kQnnExecutionProvider}); } TEST(ConvTransposeTest, ConvTranspose_1D_AsymmetricPads) { diff --git a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc index 4b194ec18b31b..e24cda17166ed 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc @@ -57,7 +57,8 @@ TEST(PoolTest, MaxPool) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: result differs + // TensorRT: result differs + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); } // Only CUDA kernel has float 16 support @@ -115,7 +116,8 @@ TEST(PoolTest, MaxPool_F16) { test.AddInput("X", x_dims, f_X); test.AddOutput("Y", expected_dims, f_Y); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Assertion `!attrs.count("pads")' failed + // TensorRT: Assertion `!attrs.count("pads")' failed + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); } #endif @@ -167,7 +169,9 @@ static void MaxPool_8_WithIndexTest(bool has_index, int64_t storage_order = 0) { storage_order == 0 ? 
test.AddOutput("Indices", expected_dims, expected_indices_row) : test.AddOutput("Indices", expected_dims, expected_indices_col); } - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kDnnlExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider, kArmNNExecutionProvider, kOpenVINOExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kDnnlExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, + kAclExecutionProvider, kArmNNExecutionProvider, kOpenVINOExecutionProvider}); } TEST(PoolTest, MaxPool_8_With_Index) { @@ -196,7 +200,7 @@ TEST(PoolTest, MaxPool1D) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); } static void MaxPool1D_8_WithIndexTest(int64_t storage_order) { @@ -217,7 +221,8 @@ static void MaxPool1D_8_WithIndexTest(int64_t storage_order) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); test.AddOutput("Indices", expected_dims, expected_indices); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kAclExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); } TEST(PoolTest, MaxPool1D_8_With_Index) { @@ -243,7 +248,8 @@ static void MaxPool1D_12_WithIndexTest_int8(int64_t storage_order) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); test.AddOutput("Indices", expected_dims, expected_indices); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kAclExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); } static void MaxPool1D_12_WithIndexTest_uint8(int64_t storage_order) { @@ -264,7 +270,8 @@ static void MaxPool1D_12_WithIndexTest_uint8(int64_t storage_order) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); test.AddOutput("Indices", expected_dims, expected_indices); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kAclExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); } TEST(PoolTest, MaxPool1D_12_With_Index_8bits) { @@ -302,9 +309,9 @@ TEST(PoolTest, MaxPool2D_uint8) { test.AddOutput("Output", output_shape, output); #if defined(OPENVINO_CONFIG_GPU_FP32) || defined(OPENVINO_CONFIG_GPU_FP16) - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kOpenVINOExecutionProvider}); #else - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); #endif } @@ -330,7 +337,7 @@ TEST(PoolTest, MaxPool_10_Dilation_1d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); } TEST(PoolTest, MaxPool_DefaultDilations) { @@ -350,7 +357,7 @@ TEST(PoolTest, MaxPool_DefaultDilations) { test.AddInput("X", 
x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); } TEST(PoolTest, MaxPool_DefaultDilations_int8) { @@ -370,7 +377,7 @@ TEST(PoolTest, MaxPool_DefaultDilations_int8) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); } TEST(PoolTest, MaxPool_DefaultDilations_uint8) { @@ -390,7 +397,7 @@ TEST(PoolTest, MaxPool_DefaultDilations_uint8) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); } TEST(PoolTest, MaxPool_10_DilationPadding_1d) { @@ -416,7 +423,7 @@ TEST(PoolTest, MaxPool_10_DilationPadding_1d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider}); + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider}); } TEST(PoolTest, MaxPool_10_Dilation_2d) { @@ -444,7 +451,7 @@ TEST(PoolTest, MaxPool_10_Dilation_2d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); } TEST(PoolTest, MaxPool_10_Dilation_2d_int8) { @@ -472,7 +479,7 @@ TEST(PoolTest, MaxPool_10_Dilation_2d_int8) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); } TEST(PoolTest, MaxPool_10_DilationPadding_2d) { @@ -500,7 +507,7 @@ TEST(PoolTest, MaxPool_10_DilationPadding_2d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider}); + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider}); } TEST(PoolTest, MaxPool_10_Dilation_Ceil0_2d) { @@ -528,7 +535,8 @@ TEST(PoolTest, MaxPool_10_Dilation_Ceil0_2d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kAclExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); } TEST(PoolTest, MaxPool_12_Dilation_Ceil0_2d_int8) { @@ -556,7 +564,8 @@ TEST(PoolTest, MaxPool_12_Dilation_Ceil0_2d_int8) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kAclExecutionProvider}); + 
test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); } TEST(PoolTest, MaxPool_10_Dilation_Ceil1_2d) { @@ -585,7 +594,8 @@ TEST(PoolTest, MaxPool_10_Dilation_Ceil1_2d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kAclExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); } TEST(PoolTest, MaxPool_10_DilationPadding_3d) { @@ -621,7 +631,7 @@ TEST(PoolTest, MaxPool_10_DilationPadding_3d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider}); + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider}); } TEST(PoolTest, GlobalMaxPool) { @@ -697,7 +707,7 @@ TEST(PoolTest, GlobalMaxPool) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); } TEST(PoolTest, GlobalMaxPool3D) { @@ -773,7 +783,7 @@ TEST(PoolTest, GlobalMaxPool3D) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); } TEST(PoolTest, AveragePool) { @@ -854,7 +864,7 @@ TEST(PoolTest, AveragePool) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); } TEST(PoolTest, AveragePool_IncludePadPixel) { @@ -878,7 +888,7 @@ TEST(PoolTest, AveragePool_IncludePadPixel) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); } // test 'strides' attribute not specified @@ -897,7 +907,7 @@ TEST(PoolTest, AveragePool_DefaultStrides) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); } TEST(PoolTest, AveragePool_10_ceil1_2d) { @@ -920,7 +930,8 @@ TEST(PoolTest, AveragePool_10_ceil1_2d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kAclExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); } TEST(PoolTest, AveragePool_19_dilation_2d) { @@ -944,7 +955,7 @@ TEST(PoolTest, AveragePool_19_dilation_2d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", 
{kTensorrtExecutionProvider, kAclExecutionProvider, kOpenVINOExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider, kOpenVINOExecutionProvider}); } TEST(PoolTest, GlobalAveragePool) { @@ -1020,7 +1031,7 @@ TEST(PoolTest, GlobalAveragePool) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); } TEST(PoolTest, GlobalAveragePool_Large_128) { @@ -1033,7 +1044,7 @@ TEST(PoolTest, GlobalAveragePool_Large_128) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals, /*sort_output=*/false, /*rel_error=*/1e-3f, /*abs_error=*/1e-2f); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); } TEST(PoolTest, GlobalAveragePool_Large_256) { @@ -1046,7 +1057,7 @@ TEST(PoolTest, GlobalAveragePool_Large_256) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals, /*sort_output=*/false, /*rel_error=*/1e-3f, /*abs_error=*/1e-2f); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); } TEST(PoolTest, LpPool) { @@ -1353,7 +1364,7 @@ TEST(PoolTest, LpPool) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); } // test data generated with lp_pool_test_generator.py @@ -1385,7 +1396,7 @@ TEST(PoolTest, LpPool1d) { // https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_network_definition.html#a94f434942252e6d98ac17705c06ce060 // TensorRT does not support 1d pooling - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); y_count++; } } @@ -1417,7 +1428,7 @@ TEST(PoolTest, LpPool2d) { test.AddAttribute("kernel_shape", kernel_sizes[kernel_size_count]); test.AddOutput("Y", y_sizes[y_count], ys[y_count]); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); y_count++; } } @@ -1435,7 +1446,7 @@ TEST(PoolTest, LpPoolCeilMode) { // https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_network_definition.html#a94f434942252e6d98ac17705c06ce060 // TensorRT does not support 1d pooling - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); } TEST(PoolTest, GlobalLpPool) { @@ -1690,7 +1701,7 @@ TEST(PoolTest, GlobalLpPool) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); } TEST(PoolTest, MaxPoolDimWithZeroForN) { @@ -1707,7 +1718,8 @@ TEST(PoolTest, MaxPoolDimWithZeroForN) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kQnnExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kQnnExecutionProvider}); } } // namespace test diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc 
b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index b0e0a0dd0d564..2902995df1e71 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -3541,6 +3541,7 @@ TEST(ReductionOpTest, ReduceDimWithZero1) { { kCoreMLExecutionProvider, kCudaExecutionProvider, + kCudaNHWCExecutionProvider, kDnnlExecutionProvider, kMIGraphXExecutionProvider, kOpenVINOExecutionProvider, @@ -3591,6 +3592,7 @@ TEST(ReductionOpTest, ReduceDimWithZero2) { { kCoreMLExecutionProvider, kCudaExecutionProvider, + kCudaNHWCExecutionProvider, kDnnlExecutionProvider, kMIGraphXExecutionProvider, kOpenVINOExecutionProvider, @@ -5779,6 +5781,7 @@ void test_empty_set(const std::string& op, int opset, bool axes_as_input, float { kCoreMLExecutionProvider, kCudaExecutionProvider, + kCudaNHWCExecutionProvider, kDmlExecutionProvider, kDnnlExecutionProvider, kMIGraphXExecutionProvider, diff --git a/onnxruntime/test/providers/cpu/rnn/rnn_op_test.cc b/onnxruntime/test/providers/cpu/rnn/rnn_op_test.cc index 1a31743e2f7e7..38734ab9f668f 100644 --- a/onnxruntime/test/providers/cpu/rnn/rnn_op_test.cc +++ b/onnxruntime/test/providers/cpu/rnn/rnn_op_test.cc @@ -744,7 +744,9 @@ TEST(RNNTest, RNN_invalid_sequence_lens) { test.AddOutput("Y_h", Y_h_dims, Y_h_data); // the CUDA RNN version allows the invalid sequence lengths, so disable testing on CUDA and TensorRT - test.Run(OpTester::ExpectResult::kExpectFailure, error_msg, {kCudaExecutionProvider, kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectFailure, error_msg, + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, + kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); }; // should batch batch_size to be valid @@ -842,7 +844,8 @@ TEST(RNNTest, RNN_bidirectional_with_sequence_lens) { test.AddOutput("Y_h", Y_h_dims, Y_h_data); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaExecutionProvider, kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); } TEST(RNNTest, RNN_with_invalid_activation_load_failure) { diff --git a/onnxruntime/test/providers/cpu/tensor/gather_elements_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_elements_op_test.cc index 8a8bc5560c084..b4bd3fca7b712 100644 --- a/onnxruntime/test/providers/cpu/tensor/gather_elements_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/gather_elements_op_test.cc @@ -383,7 +383,7 @@ TEST(GatherElementsOpTest, IndicesOutOfBounds) { // skip openvino which will not throw error message but will ensure no out-of-bound access // skip TensorRT because it doesn't support out of bounds indices test.Run(OpTester::ExpectResult::kExpectFailure, "", - {kCudaExecutionProvider, kRocmExecutionProvider, kOpenVINOExecutionProvider, + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider, kOpenVINOExecutionProvider, kTensorrtExecutionProvider, kDmlExecutionProvider}); } diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc index 5addb5dd9ce46..062f25b989a70 100644 --- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc @@ -102,7 +102,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_with_extr // TensorRT: results mismatch // ROCm: results mismatch 
test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider}); + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider}); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_with_extrapolation_uint8) { @@ -132,7 +132,8 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_with_extr test.AddOutput("Y", {N, static_cast(H * scales[1]), static_cast(W * scales[2]), C}, Y); // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider}); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_with_extrapolation_int8) { @@ -192,7 +193,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_without_e // DML: results mismatch test.Run( OpTester::ExpectResult::kExpectSuccess, "", - {kCudaExecutionProvider, kRocmExecutionProvider, kDmlExecutionProvider}); + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider, kDmlExecutionProvider}); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_without_extrapolation_int8) { @@ -267,7 +268,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear) { // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaExecutionProvider, kRocmExecutionProvider}); + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider}); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_uint8) { @@ -291,7 +292,8 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_uint8) { test.AddOutput("Y", {N, static_cast(H * scales[1]), static_cast(W * scales[2]), C}, Y); // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider}); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_int8) { @@ -439,7 +441,8 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_align_corners_uin test.AddOutput("Y", {N, static_cast(H * scales[1]), static_cast(W * scales[2]), C}, Y); // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider}); }; run_test(false); @@ -539,7 +542,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_pytorch_half_pixe // ROCm: results mismatch // DML: results mismatch test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaExecutionProvider, kRocmExecutionProvider, kDmlExecutionProvider}); + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider, kDmlExecutionProvider}); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_pytorch_half_pixel_int8) { @@ -650,7 +653,8 @@ TEST(ResizeOpTest, 
NhwcResizeOpLinearUpSampleTest_4DBilinear_asymmetric_uint8) { Y, false, .0f, 1.0f); // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider}); }; run_test(false); @@ -1913,6 +1917,8 @@ void TestAntialiasing(std::map attributes, }); // TensorRT 8.5 supports operators up to Opset 17. Temporarily exclude TensorRT EP due to accuracy issue. excluded_eps.insert(kTensorrtExecutionProvider); + // Test is flaky on kCudaNHWCExecutionProvider + excluded_eps.insert(kCudaNHWCExecutionProvider); test.Run(OpTester::ExpectResult::kExpectSuccess, "", excluded_eps); } diff --git a/onnxruntime/test/providers/cpu/tensor/scatter_op_test.cc b/onnxruntime/test/providers/cpu/tensor/scatter_op_test.cc index 30e27bb15fa57..b1dfec7951338 100644 --- a/onnxruntime/test/providers/cpu/tensor/scatter_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/scatter_op_test.cc @@ -268,7 +268,7 @@ static void scatter_invalid_index(const char* op_name, int op_version) { test.AddOutput("y", {4, 2, 1}, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 5.0f, 0.0f}); test.Run(OpTester::ExpectResult::kExpectFailure, "indices element out of data bounds, idx=4 must be within the inclusive range [-4,3]", - {kCudaExecutionProvider, kTensorrtExecutionProvider}); + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); } TEST(Scatter, InvalidIndex) { @@ -291,9 +291,10 @@ static void scatter_bool_with_axis_tests(const char* op_name, int op_version) { test.AddOutput("y", {1, 5}, {false, true, false, false, false}); #if defined(OPENVINO_CONFIG_GPU_FP32) || defined(OPENVINO_CONFIG_GPU_FP16) test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kOpenVINOExecutionProvider}); // OpenVINO: Disabled due to failure for GPU + {kCudaNHWCExecutionProvider, kOpenVINOExecutionProvider}); // OpenVINO: Disabled due to failure for GPU #else - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaNHWCExecutionProvider}); // OpenVINO: Disabled due to failure for GPU #endif } diff --git a/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc b/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc index 63b92cfc187bd..5222380d9ca56 100644 --- a/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc @@ -108,6 +108,53 @@ TEST(TensorOpTest, SpaceToDepthTest_2) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider}); } +TEST(TensorOpTest, SpaceToDepthTest_3) { + // Test swizzling with H_output > 1 + OpTester test("SpaceToDepth"); + constexpr int64_t blocksize = 2; + test.AddAttribute("blocksize", blocksize); + constexpr int64_t N = 1, C = 2, H = 4, W = 8; + + const std::vector X = { + 0.0f, 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, + 1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f, 1.6f, 1.7f, + + 2.0f, 2.1f, 2.2f, 2.3f, 2.4f, 2.5f, 2.6f, 2.7f, + 3.0f, 3.1f, 3.2f, 3.3f, 3.4f, 3.5f, 3.6f, 3.7f, + + 4.0f, 4.1f, 4.2f, 4.3f, 4.4f, 4.5f, 4.6f, 4.7f, + 5.0f, 5.1f, 5.2f, 5.3f, 5.4f, 5.5f, 5.6f, 5.7f, + 6.0f, 6.1f, 6.2f, 6.3f, 6.4f, 6.5f, 6.6f, 6.7f, + 7.0f, 7.1f, 7.2f, 7.3f, 7.4f, 7.5f, 7.6f, 7.7f}; + + test.AddInput("input", {N, C, H, W}, X); + + const std::vector result = { + 0.0f, 0.2f, 0.4f, 0.6f, + 2.0f, 2.2f, 2.4f, 2.6f, + 4.0f, 4.2f, 4.4f, 4.6f, + 6.0f, 6.2f, 
6.4f, 6.6f, + + 0.1f, 0.3f, 0.5f, 0.7f, + 2.1f, 2.3f, 2.5f, 2.7f, + 4.1f, 4.3f, 4.5f, 4.7f, + 6.1f, 6.3f, 6.5f, 6.7f, + + 1.0f, 1.2f, 1.4f, 1.6f, + 3.0f, 3.2f, 3.4f, 3.6f, + 5.0f, 5.2f, 5.4f, 5.6f, + 7.0f, 7.2f, 7.4f, 7.6f, + + 1.1f, 1.3f, 1.5f, 1.7f, + 3.1f, 3.3f, 3.5f, 3.7f, + 5.1f, 5.3f, 5.5f, 5.7f, + 7.1f, 7.3f, 7.5f, 7.7f}; + + test.AddOutput("output", {N, C * blocksize * blocksize, H / blocksize, W / blocksize}, result); + + test.Run(); +} + TEST(TensorOpTest, DepthToSpaceTest_1) { OpTester test("DepthToSpace", 7); // create an opset 7 model constexpr int64_t blocksize = 2; diff --git a/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc b/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc index 72cb84d50f078..188532cfa350a 100644 --- a/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc @@ -692,7 +692,7 @@ TEST(UpsampleOpTest, NhwcUpsampleOp4D1CBilinearTest) { // TensorRT: results mismatch // ROCm: results mismatch test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider}); + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider}); } TEST(UpsampleOpTest, NhwcUpsampleOp4DBilinearTest) { @@ -766,7 +766,7 @@ TEST(UpsampleOpTest, NhwcUpsampleOp4DBilinearTest) { // TensorRT: results mismatch // ROCm: results mismatch test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider}); + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider}); } TEST(UpsampleOpTest, UpsampleOp2DBilinearTest) { @@ -886,7 +886,7 @@ TEST(UpsampleOpTest, NhwcUpsampleOp4DBilinearTest_int32) { // TensorRT: results mismatch // ROCm: results mismatch test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider}); + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider}); } TEST(UpsampleOpTest, UpsampleOpNearestTest_1D) { From 1ce5bfb0ecc94a4a98eb093a53cd248ab6b7167b Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Thu, 7 Mar 2024 08:19:59 +0800 Subject: [PATCH 122/279] [WebNN EP] Make sure optional input is provided (#19686) Some optional input is presented as empty string, we should not only check if the input size is correct, but also check if the optional input is not empty. e.g. 
the Pad node has an empty optional input in the sam-b-encoder.onnx model. --- .../core/providers/webnn/builders/impl/pad_op_builder.cc | 6 +++--- .../providers/webnn/builders/impl/reduction_op_builder.cc | 2 +- .../core/providers/webnn/builders/impl/split_op_builder.cc | 2 +- .../webnn/builders/impl/squeeze_unsqueeze_op_builder.cc | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc index 52b5518857773..9852db0abc9d2 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc @@ -88,15 +88,15 @@ Status PadOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const auto& pads_tensor = *initializers.at(input_defs[1]->Name()); ORT_RETURN_IF_NOT(ReadIntArrayFrom1DTensor(pads_tensor, pads, logger), "Error while read pads tensor"); - // Constant value and axes are optional. + // Constant value and axes are optional. Make sure they are not empty. - if (input_defs.size() >= 3) { + if (!GetTensorName(input_defs, 2).empty()) { const auto value_tensor = *initializers.at(input_defs[2]->Name()); emscripten::val value = emscripten::val::object(); ORT_RETURN_IF_NOT(ReadScalarTensorData(value_tensor, value, logger), "Cannot read constant value"); options.set("value", value); } - if (input_defs.size() == 4) { + if (!GetTensorName(input_defs, 3).empty()) { const auto input_rank = input_shape.size(); std::vector axes; const auto& axes_tensor = *initializers.at(input_defs[3]->Name()); diff --git a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc index f446a7b81d1c0..c0954f7cf6fb1 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc @@ -65,7 +65,7 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, if (opset >= 18 || (op_type == "ReduceSum" && opset >= 13)) { // 'axes' is an optional input. const auto noop_with_empty_axes = helper.Get("noop_with_empty_axes", 0); - if (input_defs.size() > 1) { + if (!GetTensorName(input_defs, 1).empty()) { // Optional input axes is provided, use axes initializer data.
const auto& initializers(model_builder.GetInitializerTensors()); const auto& axes_tensor = *initializers.at(input_defs[1]->Name()); diff --git a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc index 91f21b196be54..9819e4ce7ac5b 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc @@ -57,7 +57,7 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, axis = SafeInt(HandleNegativeAxis(axis, rank)); options.set("axis", axis); - if (input_defs.size() == 2) { + if (!GetTensorName(input_defs, 1).empty()) { // Inputs contains optional 'split' input std::vector splits; const auto& initializers(model_builder.GetInitializerTensors()); diff --git a/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc index 15149bd8fe821..8e6feb62fa8c4 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc @@ -58,7 +58,7 @@ Status SqueezeUnsqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_buil std::vector axes_data; auto rank = input_rank; - if (node.SinceVersion() >= 13 && input_defs.size() > 1) { + if (node.SinceVersion() >= 13 && !GetTensorName(input_defs, 1).empty()) { // Input axes is provided, use axes initializer data. const auto& initializers = model_builder.GetInitializerTensors(); const auto& axes_tensor = *initializers.at(input_defs[1]->Name()); From 5c5d6e99ce8deac2f68167173736735a77fa53b2 Mon Sep 17 00:00:00 2001 From: pengwa Date: Thu, 7 Mar 2024 09:12:12 +0800 Subject: [PATCH 123/279] Define recomputable op list with domain/opset (#19722) ### Define recomputable op list with domain/opset Originally, we just check the OpType and decide whether it is recomputable. In this PR, few improvements are made: 1. [Op type search] Domain + OpType are used to check whether the op is supported to recompute. 2. [Opset search] Then, node.SinceVersion() will be searched in the supported opsets. 3. During subgraph detection, If the node in that this opset is supported, get the ignorable input indices, which means we don't consider in the bottom-up search. This would save time for the subgraph detection. ### Motivation and Context --- onnxruntime/core/common/string_utils.h | 9 +- .../compute_optimizer/upstream_gather.cc | 25 +- .../compute_optimizer/upstream_reshape.cc | 15 +- .../upstream_transformer_base.cc | 3 +- .../upstream_transformer_base.h | 7 - .../memory_optimizer/recompute_analysis.cc | 414 +++++++++++++++--- 6 files changed, 382 insertions(+), 91 deletions(-) diff --git a/onnxruntime/core/common/string_utils.h b/onnxruntime/core/common/string_utils.h index 03e94cefd0564..716eed1afec51 100644 --- a/onnxruntime/core/common/string_utils.h +++ b/onnxruntime/core/common/string_utils.h @@ -66,7 +66,14 @@ inline std::string TrimString(std::string s) { } /** - * So use this simple hash to generate unique int by given string input. + * @brief A consistent way to construct the full qualified op name. + */ +inline std::string GetFullQualifiedOpName(const std::string& op_type, const std::string& domain) { + return MakeString(domain, "::", op_type); +} + +/** + * Use this simple hash to generate unique int by given string input. 
*/ inline uint32_t GetHashFromString(const std::string& str_value) { uint32_t hash = 0; diff --git a/onnxruntime/core/optimizer/compute_optimizer/upstream_gather.cc b/onnxruntime/core/optimizer/compute_optimizer/upstream_gather.cc index 9c98ed6d3e114..1516fb37a7e9f 100644 --- a/onnxruntime/core/optimizer/compute_optimizer/upstream_gather.cc +++ b/onnxruntime/core/optimizer/compute_optimizer/upstream_gather.cc @@ -4,6 +4,7 @@ #ifdef ENABLE_TRAINING #include +#include "core/common/string_utils.h" #include "core/graph/graph_utils.h" #include "core/optimizer/initializer.h" #include "core/optimizer/utils.h" @@ -26,38 +27,38 @@ UpStreamGatherGraphTransformer::UpStreamGatherGraphTransformer( // 2. Whether the outputs have the same dim changes if the Gather node moves before that operator. // 3. Should all inputs be allowed when tracking back further (bottom-up); // if not, add the input index restriction as MatMul did. - {GetFullQualifiedOpName("Add", kOnnxDomain), + {utils::GetFullQualifiedOpName("Add", kOnnxDomain), OpPassThroughConfig(std::make_shared>(), opset_14_13_7_6_1)}, - {GetFullQualifiedOpName("BiasGelu", kMSDomain), + {utils::GetFullQualifiedOpName("BiasGelu", kMSDomain), OpPassThroughConfig(std::make_shared>(), opset_1)}, - {GetFullQualifiedOpName("Cast", kOnnxDomain), + {utils::GetFullQualifiedOpName("Cast", kOnnxDomain), OpPassThroughConfig(std::make_shared>(), opset_19_13_9_6_1)}, - {GetFullQualifiedOpName("Div", kOnnxDomain), + {utils::GetFullQualifiedOpName("Div", kOnnxDomain), OpPassThroughConfig(std::make_shared>(), opset_14_13_7_6_1)}, - {GetFullQualifiedOpName("Dropout", kOnnxDomain), + {utils::GetFullQualifiedOpName("Dropout", kOnnxDomain), OpPassThroughConfig(std::make_shared>(), opset_13_12_10_7_6_1)}, - {GetFullQualifiedOpName("Gelu", kMSDomain), + {utils::GetFullQualifiedOpName("Gelu", kMSDomain), OpPassThroughConfig(std::make_shared>(), opset_1)}, {// Be noted, this is our own implementation of ONNX domain op. 
- GetFullQualifiedOpName("LayerNormalization", kOnnxDomain), + utils::GetFullQualifiedOpName("LayerNormalization", kOnnxDomain), OpPassThroughConfig(std::make_shared(), opset_1)}, - {GetFullQualifiedOpName("MatMul", kOnnxDomain), + {utils::GetFullQualifiedOpName("MatMul", kOnnxDomain), OpPassThroughConfig(std::make_shared(), opset_13_9_1)}, - {GetFullQualifiedOpName("Reshape", kOnnxDomain), + {utils::GetFullQualifiedOpName("Reshape", kOnnxDomain), OpPassThroughConfig(std::make_shared(), opset_19_14_13_5_1)}, - {GetFullQualifiedOpName("Softmax", kOnnxDomain), + {utils::GetFullQualifiedOpName("Softmax", kOnnxDomain), OpPassThroughConfig(std::make_shared(), opset_13_11_1)}, - {GetFullQualifiedOpName("Transpose", kOnnxDomain), + {utils::GetFullQualifiedOpName("Transpose", kOnnxDomain), OpPassThroughConfig(std::make_shared(), opset_13_1)}, }); @@ -69,7 +70,7 @@ bool UpStreamGatherGraphTransformer::UpStreamInternal( const OpPassThroughConfig& pass_through_config, const logging::Logger& logger) const { Node& slice_node = *info.node_ptr; - const std::string op_type = GetFullQualifiedOpName(current_node.OpType(), current_node.Domain()); + const std::string op_type = utils::GetFullQualifiedOpName(current_node.OpType(), current_node.Domain()); std::unordered_map propagate_input_indices; std::unordered_map> all_input_cmp_rets; diff --git a/onnxruntime/core/optimizer/compute_optimizer/upstream_reshape.cc b/onnxruntime/core/optimizer/compute_optimizer/upstream_reshape.cc index f7b48de2caaf5..716988e93312c 100644 --- a/onnxruntime/core/optimizer/compute_optimizer/upstream_reshape.cc +++ b/onnxruntime/core/optimizer/compute_optimizer/upstream_reshape.cc @@ -4,6 +4,7 @@ #ifdef ENABLE_TRAINING #include "core/framework/tensorprotoutils.h" +#include "core/common/string_utils.h" #include "core/graph/graph_utils.h" #include "core/optimizer/utils.h" #include "core/optimizer/compute_optimizer/upstream_reshape_actors.h" @@ -21,23 +22,23 @@ UpStreamReshapeGraphTransformer::UpStreamReshapeGraphTransformer( // If optype is not enough to guarantee the equivalence, we need to add a customized pre-check function. // 2. Should all inputs be allowed when tracking back further (bottom-up); // if not, add the input index restriction. - {GetFullQualifiedOpName("Add", kOnnxDomain), + {utils::GetFullQualifiedOpName("Add", kOnnxDomain), OpPassThroughConfig( std::make_shared>(), opset_14_13_7_6_1)}, - {GetFullQualifiedOpName("BiasGelu", kMSDomain), + {utils::GetFullQualifiedOpName("BiasGelu", kMSDomain), OpPassThroughConfig( std::make_shared>(), opset_1)}, - {GetFullQualifiedOpName("Cast", kOnnxDomain), + {utils::GetFullQualifiedOpName("Cast", kOnnxDomain), OpPassThroughConfig( std::make_shared>(), opset_19_13_9_6_1)}, - {GetFullQualifiedOpName("Dropout", kOnnxDomain), + {utils::GetFullQualifiedOpName("Dropout", kOnnxDomain), OpPassThroughConfig( std::make_shared>(), opset_13_12_10_7_6_1)}, {// Be noted, this is our own implementation of ONNX domain op. 
- GetFullQualifiedOpName("LayerNormalization", kOnnxDomain), + utils::GetFullQualifiedOpName("LayerNormalization", kOnnxDomain), OpPassThroughConfig( std::make_shared(), opset_1)}, - {GetFullQualifiedOpName("MatMul", kOnnxDomain), + {utils::GetFullQualifiedOpName("MatMul", kOnnxDomain), OpPassThroughConfig( std::make_shared(), opset_13_9_1)}, }); @@ -47,7 +48,7 @@ bool UpStreamReshapeGraphTransformer::UpStreamInternal( Graph& graph, std::deque& queue, Node& current_node, ReshapeInfo& info, const OpPassThroughConfig& pass_through_config, const logging::Logger& logger) const { - const std::string op_type = GetFullQualifiedOpName(current_node.OpType(), current_node.Domain()); + const std::string op_type = utils::GetFullQualifiedOpName(current_node.OpType(), current_node.Domain()); std::vector propagate_input_indices; std::unordered_map> all_input_cmp_rets; diff --git a/onnxruntime/core/optimizer/compute_optimizer/upstream_transformer_base.cc b/onnxruntime/core/optimizer/compute_optimizer/upstream_transformer_base.cc index f08e37296d259..4582f26a7dc68 100644 --- a/onnxruntime/core/optimizer/compute_optimizer/upstream_transformer_base.cc +++ b/onnxruntime/core/optimizer/compute_optimizer/upstream_transformer_base.cc @@ -5,6 +5,7 @@ #include #include "core/common/safeint.h" +#include "core/common/string_utils.h" #include "core/graph/graph_utils.h" #include "core/optimizer/initializer.h" #include "core/optimizer/utils.h" @@ -130,7 +131,7 @@ template bool UpStreamGraphTransformerBase::Upstream(Graph& graph, std::deque& queue, Node& current_node, T1& info, const logging::Logger& logger) const { - const std::string op_type = GetFullQualifiedOpName(current_node.OpType(), current_node.Domain()); + const std::string op_type = utils::GetFullQualifiedOpName(current_node.OpType(), current_node.Domain()); if (allowed_passthrough_ops_.count(op_type)) { auto& pass_through_config = allowed_passthrough_ops_.at(op_type); LOG_DEBUG_INFO(logger, "Enter reorder handle for node " + current_node.Name() + "(" + op_type + ")"); diff --git a/onnxruntime/core/optimizer/compute_optimizer/upstream_transformer_base.h b/onnxruntime/core/optimizer/compute_optimizer/upstream_transformer_base.h index 6e22fc791ade3..d848a03c555bb 100644 --- a/onnxruntime/core/optimizer/compute_optimizer/upstream_transformer_base.h +++ b/onnxruntime/core/optimizer/compute_optimizer/upstream_transformer_base.h @@ -72,13 +72,6 @@ class UpStreamGraphTransformerBase : public GraphTransformer { const OpPassThroughConfig& pass_through_config, const logging::Logger& logger) const = 0; - /** - * @brief A consistent way to construct the full qualified op name. - */ - std::string GetFullQualifiedOpName(const std::string& op_type, const std::string& domain) const { - return domain + "::" + op_type; - } - std::unordered_map> allowed_passthrough_ops_; private: diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc index 76b3325f36116..b421eb2ab32da 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc @@ -48,75 +48,352 @@ float InputOutputSizeRatio(const Node* node) { return 1.0f; } +using IgnorableInputIndices = InlinedVector; +using OpsetToIgnorableIndicesMap = InlinedHashMap; + /** - * @brief Used to define per-op recompute config. 
+ * @brief Get the Allowed Recompute Ops object + * + * The supported op types are predefined. + * Most recent revisited for ONNX v1.15.0 release - https://github.com/onnx/onnx/blob/b86cc54efce19530fb953e4b21f57e6b3888534c/docs/Operators.md * + * We defined supported list explicitly instead of using a excluding list for the following reasons: + * 1. Some ops generate indeterministic results (for example using random number generator). We need evaluate whether + * this is a problem for recompute before adding the support, instead of fixing this after we find and try to + * fix convergence issues (which will be very hard if we have multiple indeterministic operators by default supported.) + * 2. Some ops schema will be changed in new opsets, we need also check manually whether it is applicable to recompute + * or not. + * 3. Some ops are not supported in older opsets, we need to check whether it is applicable to recompute or not. */ -struct AllowedRecomputeNodeConfig { - InlinedVector input_arg_indices; // input index to iterate further (bottom up) -}; - -// The supported op types are predefined. - -const InlinedHashMap& GetAllowedRecomputeOps(int probe_op_level) { - static InlinedHashMap> recomputable_op_table_map; +const InlinedHashMap& GetAllowedRecomputeOps(int probe_op_level) { + static InlinedHashMap> recomputable_op_table_map; if (recomputable_op_table_map.find(probe_op_level) != recomputable_op_table_map.end()) { return recomputable_op_table_map.at(probe_op_level); } - recomputable_op_table_map.insert({probe_op_level, InlinedHashMap()}); + recomputable_op_table_map.insert({probe_op_level, InlinedHashMap()}); auto& recomputable_op_table = recomputable_op_table_map.at(probe_op_level); if (probe_op_level >= static_cast(ProbeLevel::Basic)) { recomputable_op_table.insert({ - // Binary elementwise - {"Add", AllowedRecomputeNodeConfig{{0, 1}}}, - {"BiasGelu", AllowedRecomputeNodeConfig{{0, 1}}}, - {"Div", AllowedRecomputeNodeConfig{{0, 1}}}, - {"Equal", AllowedRecomputeNodeConfig{{0, 1}}}, - {"Mul", AllowedRecomputeNodeConfig{{0, 1}}}, - {"Sub", AllowedRecomputeNodeConfig{{0, 1}}}, - - // Data layout - /// The shape input is trivial whether it exists or not in backward. - {"Reshape", AllowedRecomputeNodeConfig{{0}}}, - {"Shape", AllowedRecomputeNodeConfig{{0}}}, - {"Squeeze", AllowedRecomputeNodeConfig{{0}}}, - {"Transpose", AllowedRecomputeNodeConfig{{0}}}, - {"Unsqueeze", AllowedRecomputeNodeConfig{{0}}}, - - // Unary elementwise - {"Dropout", AllowedRecomputeNodeConfig{{0}}}, - {"BiasGelu", AllowedRecomputeNodeConfig{{0, 1}}}, - /// The ratio and mode input are trivial whether they exist or not in backward - {"BitmaskDropout", AllowedRecomputeNodeConfig{{0}}}, - /// The axis input is trivial whether it exists or not in backward - {"CumSum", AllowedRecomputeNodeConfig{{0}}}, - {"Expand", AllowedRecomputeNodeConfig{{0}}}, - {"FastGelu", AllowedRecomputeNodeConfig{{0}}}, - {"Gelu", AllowedRecomputeNodeConfig{{0}}}, - {"QuickGelu", AllowedRecomputeNodeConfig{{0}}}, - - // Ternary elementwise - {"Where", AllowedRecomputeNodeConfig{{0, 1, 2}}}, - - // Data copy - {"Tile", AllowedRecomputeNodeConfig{{0}}}, - {"Cast", AllowedRecomputeNodeConfig{{0}}}, - {"ConcatTraining", AllowedRecomputeNodeConfig{{0, 1}}}, // Input could be more than 2. But mostly 2. 
- {"Slice", AllowedRecomputeNodeConfig{{0}}}, - {"Split", AllowedRecomputeNodeConfig{{0}}}, - {"Gather", AllowedRecomputeNodeConfig{{0}}}, + { + utils::GetFullQualifiedOpName("Add", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {7, {}}, + {13, {}}, + {14, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("BatchNormalization", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {7, {}}, + {9, {}}, + {14, {}}, + {15, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("BiasGelu", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("BiasDropout", kMSDomain), + { + {1, {3, 4}}, // ignore ratio (optional) and training mode (optional) + }, + }, + { + utils::GetFullQualifiedOpName("BitmaskBiasDropout", kMSDomain), + { + {1, {3, 4}}, // ignore ratio (optional) and training mode (optional) + }, + }, + { + utils::GetFullQualifiedOpName("BitmaskDropout", kMSDomain), + { + {1, {1, 2}}, // ignore ratio (optional) and training mode (optional) + }, + }, + { + utils::GetFullQualifiedOpName("Cast", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {9, {}}, + {13, {}}, + {19, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("ConcatTraining", kMSDomain), + { + {1, {}}, + + }, + }, + { + utils::GetFullQualifiedOpName("ConstantOfShape", kOnnxDomain), + { + {9, {0}}, // ignore the `input`, e.g. the shape of the expected output tensor + {20, {0}}, + }, + }, + { + utils::GetFullQualifiedOpName("Dropout", kOnnxDomain), + { + // ONNX Dropout 1, 6, 7, 10 do not have seed attribute, so we remove them from the recompute support. + {12, {1, 2}}, // ignore ratio and training_mode + {13, {1, 2}}, + }, + }, + { + utils::GetFullQualifiedOpName("Div", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {7, {}}, + {13, {}}, + {14, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Expand", kOnnxDomain), + { + {8, {1}}, // Ignore the shape. + {13, {1}}, + }, + }, + { + utils::GetFullQualifiedOpName("Cos", kOnnxDomain), + { + {7, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("CumSum", kOnnxDomain), + { + // The axis input is trivial + {11, {1}}, + {14, {1}}, + }, + }, + { + utils::GetFullQualifiedOpName("Einsum", kOnnxDomain), + { + {12, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Equal", kOnnxDomain), + { + {1, {}}, + {7, {}}, + {11, {}}, + {13, {}}, + {19, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("FastGelu", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Gather", kOnnxDomain), + { + {1, {1}}, // ignore the indices + {11, {1}}, + {13, {1}}, + }, + }, + { + utils::GetFullQualifiedOpName("Gelu", kOnnxDomain), + { + {20, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Gelu", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Less", kOnnxDomain), + { + {1, {}}, + {7, {}}, + {9, {}}, + {13, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Mul", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {7, {}}, + {13, {}}, + {14, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Range", kOnnxDomain), + { + {11, {0, 1, 2}}, // ignore start, end, delta, because they are scalars. + }, + }, + { + utils::GetFullQualifiedOpName("Reshape", kOnnxDomain), + { + {1, {}}, + {5, {}}, // ignore the shape. 
+ {13, {}}, + {14, {}}, + {19, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Sin", kOnnxDomain), + { + {7, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Slice", kOnnxDomain), + { + {1, {}}, + {10, {1, 2, 3, 4}}, // ignore starts, ends, axes (optional) and steps (optional) + {11, {1, 2, 3, 4}}, + {13, {1, 2, 3, 4}}, + }, + }, + { + utils::GetFullQualifiedOpName("Split", kOnnxDomain), + { + {1, {1}}, // ignore split (optional) + {2, {}}, + {11, {}}, + {13, {1}}, // ignore the split (optional) + {18, {1}}, + }, + }, + { + utils::GetFullQualifiedOpName("Squeeze", kOnnxDomain), + { + {1, {}}, + {11, {}}, + {13, {1}}, // ignore the axes (optional) + }, + }, + { + utils::GetFullQualifiedOpName("Sub", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {7, {}}, + {13, {}}, + {14, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Tile", kOnnxDomain), + { + {1, {1, 2}}, + {6, {1}}, + {13, {1}}, + }, + }, + { + utils::GetFullQualifiedOpName("Transpose", kOnnxDomain), + { + {1, {}}, + {13, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Trilu", kOnnxDomain), + { + {14, {1}}, // ignore k (optional) + }, + }, + { + utils::GetFullQualifiedOpName("QuickGelu", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Unsqueeze", kOnnxDomain), + { + {1, {}}, + {11, {}}, + {13, {1}}, // ignore the axes (optional) + }, + }, + { + utils::GetFullQualifiedOpName("Where", kOnnxDomain), + { + {9, {}}, + {16, {}}, + }, + }, + }); } if (probe_op_level >= static_cast(ProbeLevel::Advanced)) { recomputable_op_table.insert({ - {"LayerNormalization", AllowedRecomputeNodeConfig{{0, 1, 2}}}, - {"MatMul", AllowedRecomputeNodeConfig{{0, 1}}}, - {"FusedMatMul", AllowedRecomputeNodeConfig{{0, 1}}}, - {"Softmax", AllowedRecomputeNodeConfig{{0}}}, - {"BiasSoftmax", AllowedRecomputeNodeConfig{{0, 1}}}, - {"BiasSoftmaxDropout", AllowedRecomputeNodeConfig{{0, 1}}}, + { + utils::GetFullQualifiedOpName("BiasSoftmax", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("BiasSoftmaxDropout", kMSDomain), + { + {1, {2}}, // ignore ratio (optional) + }, + }, + { + utils::GetFullQualifiedOpName("LayerNormalization", kOnnxDomain), + { + // Opset 1 in ONNX official does not have LayerNormalization, + // while our contrib op defined LayerNormalization in opset 1 in ONNX domain. + {1, {}}, + {17, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("MatMul", kOnnxDomain), + { + {1, {}}, + {9, {}}, + {13, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("FusedMatMul", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Softmax", kOnnxDomain), + { + {1, {}}, + {11, {}}, + {13, {}}, + }, + }, }); } @@ -127,8 +404,20 @@ const InlinedHashMap& GetAllowedRecompu * @brief Check whether a node is a recomputable node at given probe level. 
*/ bool IsRecomputable(const Node& node, ProbeLevel probe_level) { - const auto& op_table = GetAllowedRecomputeOps(static_cast(probe_level)); - return op_table.find(node.OpType()) != op_table.end(); + const InlinedHashMap& op_table = GetAllowedRecomputeOps(static_cast(probe_level)); + auto it = op_table.find(utils::GetFullQualifiedOpName(node.OpType(), node.Domain())); + if (it == op_table.end()) { + return false; + } + return it->second.count(node.SinceVersion()); +} + +const InlinedVector& GetIgnorableInputIndices(const Node& node, ProbeLevel probe_level) { + const InlinedHashMap& op_table = GetAllowedRecomputeOps(static_cast(probe_level)); + auto it = op_table.find(utils::GetFullQualifiedOpName(node.OpType(), node.Domain())); + ORT_ENFORCE(it != op_table.end(), "Cannot get ignorable indices since the node type is supported in the list."); + ORT_ENFORCE(it->second.count(node.SinceVersion()) > 0, "Cannot get ignorable indices since the opset is supported"); + return it->second.at(node.SinceVersion()); } /** @@ -163,7 +452,6 @@ Status SelectRecomputeSubgraph(const Node& entry_node, bool& can_compromise_stashed_activation, float& save_ratio) { const ProbeLevel probe_level = probe_config.probe_level; - const auto& recomputable_op_table = GetAllowedRecomputeOps(static_cast(probe_level)); can_compromise_stashed_activation = false; @@ -213,7 +501,7 @@ Status SelectRecomputeSubgraph(const Node& entry_node, // If current op is NOT in allowed list: // 1). the output does not exist in backward, we cannot find a good solution for so, the search terminates. // 2). the output is used in backward, we don't need to trace back further, so continue searching. - auto op_recompute_config_it = recomputable_op_table.find(curr_node->OpType()); + bool is_recomputable = IsRecomputable(*curr_node, probe_level); auto cur_output_arg_name = curr_node->OutputDefs()[p.second]->Name(); if (is_first_queue_scan) { // We handle the entry node outputs differently because, we don't want this case falls into and succeed one of @@ -221,14 +509,14 @@ Status SelectRecomputeSubgraph(const Node& entry_node, // 1. "op is not in recompute op list, but its output is used in backward" // 2. "op is in recompute op list, but its output is used in backward" // (either of the above checks is true for entry node outputs) - if (op_recompute_config_it == recomputable_op_table.end()) { + if (!is_recomputable) { early_stop = true; MO_LOG_DEBUG_INFO(logger, "Entry Node " + curr_node->Name() + "(" + curr_node->OpType() + ") is **NOT** in recompute op list, search terminates."); break; } } else { - if (op_recompute_config_it == recomputable_op_table.end()) { + if (!is_recomputable) { if (fw_op_output_arg_used_map.at(cur_output_arg_name).second) { MO_LOG_DEBUG_INFO(logger, "Node " + curr_node->Name() + "(" + curr_node->OpType() + ") is **NOT** in recompute op list, but its output [" + @@ -283,14 +571,14 @@ Status SelectRecomputeSubgraph(const Node& entry_node, } // Iterate all input nodes according to allowed input arg index of the entry node. 
- const auto& input_arg_indices = op_recompute_config_it->second.input_arg_indices; + const auto& igorable_input_arg_indices = GetIgnorableInputIndices(*curr_node, probe_level); for (auto it = curr_node->InputEdgesBegin(), end = curr_node->InputEdgesEnd(); it != end; ++it) { const Node::EdgeEnd& input_edge = *it; const auto& parent_node = input_edge.GetNode(); const auto parent_node_output_index = input_edge.GetSrcArgIndex(); const auto current_node_input_index = input_edge.GetDstArgIndex(); - if (std::find(input_arg_indices.begin(), input_arg_indices.end(), current_node_input_index) != - input_arg_indices.end()) { + if (std::find(igorable_input_arg_indices.begin(), igorable_input_arg_indices.end(), current_node_input_index) == + igorable_input_arg_indices.end()) { // If the tensor size is constant and very small (Now < 1M), we stop adding the input edge into queue. auto output_shape = parent_node.OutputDefs()[parent_node_output_index]->Shape(); if (output_shape) { From bff4f8bf75562704720624fac63b149d10042ac8 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 6 Mar 2024 17:47:17 -0800 Subject: [PATCH 124/279] Update tolerance of provider tests to fix flaky tests (#19792) ### Description Check float/double/float16/bfloat16 tensors are close like [numpy.isclose](https://numpy.org/doc/stable/reference/generated/numpy.isclose.html). ``` absolute(a - b) <= (atol + rtol * absolute(b)) ``` The default tolerance thresholds: - float: atol=1e-5 and rtol=1e-4 - float16: atol=0.0025 and rtol=0.001 - bfloat16: atol=0.02 and rtol=0.01 ### Motivation and Context Current pipeline has frequent failure due to using only relative tolerance in https://github.com/microsoft/onnxruntime/pull/19608: [ RUN ] MatMulIntegerToFloat.NoZeroPoint_NoBias_test_U8S8 1: C:\a\_work\1\s\onnxruntime\test\providers\checkers.cc(272): error: The difference between cur_expected[i] and cur_actual[i] is 1.3113021850585938e-06, which exceeds *(params.relative_error) * std::abs(cur_expected[i]), where 1: cur_expected[i] evaluates to -1.3113021850585938e-06, 1: cur_actual[i] evaluates to 0, and 1: *(params.relative_error) * std::abs(cur_expected[i]) evaluates to 2.6226043559063328e-08. It is not reasonable to use relative tolerance for a small value very close to 0. Combining relative tolerance with a positive absolute tolerance could avoid such issue. 
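A standalone sketch of the combined-tolerance rule above, using the failing value from the quoted log (the `IsClose` helper and `main` harness are illustrative only, not the gtest code changed below):

```
// Minimal sketch of the numpy.isclose-style check: |a - b| <= atol + rtol * |b|.
#include <cassert>
#include <cmath>

static bool IsClose(float actual, float expected, float atol, float rtol) {
  return std::fabs(actual - expected) <= atol + rtol * std::fabs(expected);
}

int main() {
  const float expected = -1.3113021850585938e-06f;  // tiny expected value from the log
  const float actual = 0.0f;                        // kernel output rounded to zero
  // With relative tolerance alone the allowed error is ~1.3e-10, so the check fails.
  assert(!IsClose(actual, expected, /*atol=*/0.0f, /*rtol=*/1e-4f));
  // Adding the default float atol (1e-5) makes the same comparison pass.
  assert(IsClose(actual, expected, /*atol=*/1e-5f, /*rtol=*/1e-4f));
  return 0;
}
```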
--- .../matmul_integer_to_float_test.cc | 1 + onnxruntime/test/providers/checkers.cc | 159 +++++++++--------- 2 files changed, 83 insertions(+), 77 deletions(-) diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index 6f3ca7e239671..72a5ba4dcefbf 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -127,6 +127,7 @@ void TestMatMulIntegerToFloat(bool is_matrix_b_constant, if (std::is_same_v) { test.AddOutput("Y", {M, N}, Y_data); + test.SetOutputAbsErr("Y", 0.0001f); test.SetOutputRelErr("Y", 0.02f); } else { test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); diff --git a/onnxruntime/test/providers/checkers.cc b/onnxruntime/test/providers/checkers.cc index 85ccb8f175f62..c97e6d9de4911 100644 --- a/onnxruntime/test/providers/checkers.cc +++ b/onnxruntime/test/providers/checkers.cc @@ -14,6 +14,54 @@ namespace onnxruntime { namespace test { namespace { + +template +struct DefaultTolerance; + +template <> +struct DefaultTolerance { + static constexpr float absolute = 1e-6f; + static constexpr float relative = 1e-5f; +}; + +template <> +struct DefaultTolerance { + static constexpr float absolute = 1e-5f; + static constexpr float relative = 1e-4f; +}; + +template <> +struct DefaultTolerance { + // The thresholds are estimated with PyTorch script like the following: + // x = torch.rand(1000, 1000) + // absolute = ((x + 1e-6).to(torch.float16) - x).abs().max() * 10 + // x[abs(x) < absolute] = absolute + // relative = ((x - x.to(torch.float16)) / x).abs().max() * 2 + static constexpr float absolute = 0.0025f; + static constexpr float relative = 0.001f; +}; + +template <> +struct DefaultTolerance { + static constexpr float absolute = 0.02f; + static constexpr float relative = 0.01f; +}; + +template +T get_tolerance(float absolute, float relative, T expected_value) { + static_assert(std::is_floating_point::value, "T must be a floating point type"); + + // The formula is similar to numpy.isclose: https://numpy.org/doc/stable/reference/generated/numpy.isclose.html + return static_cast(absolute) + static_cast(relative) * std::abs(expected_value); +} + +template // D is the original data type +T get_tolerance(const ValidateOutputParams& params, T expected_value) { + float absolute = (params.absolute_error.has_value() ? *(params.absolute_error) : DefaultTolerance::absolute); + float relative = (params.relative_error.has_value() ? *(params.relative_error) : DefaultTolerance::relative); + return get_tolerance(absolute, relative, expected_value); +} + template Tensor copy_sort(const Tensor& src, const AllocatorPtr& allocator) { Tensor result(src.DataType(), src.Shape(), allocator); @@ -67,7 +115,7 @@ struct TensorCheck { cur_actual = actual.Data(); } - for (int i = 0; i < size; ++i) { + for (int64_t i = 0; i < size; ++i) { EXPECT_EQ(cur_expected[i], cur_actual[i]) << "i:" << i; } } @@ -111,7 +159,7 @@ struct TensorCheck { double threshold = has_abs_err ? 
*(params.absolute_error) : 0.0; - for (int i = 0; i < size; ++i) { + for (int64_t i = 0; i < size; ++i) { if (has_rel_err) { EXPECT_NEAR(cur_expected[i], cur_actual[i], *(params.relative_error) * cur_expected[i]) // expected[i] is unsigned, can't be negative @@ -121,7 +169,7 @@ struct TensorCheck { } } } else { - for (int i = 0; i < size; ++i) { + for (int64_t i = 0; i < size; ++i) { EXPECT_EQ(cur_expected[i], cur_actual[i]) << "i:" << i; } } @@ -157,11 +205,11 @@ struct TensorCheck { if (has_abs_err) { double threshold = *(params.absolute_error); - for (int i = 0; i < size; ++i) { + for (int64_t i = 0; i < size; ++i) { EXPECT_NEAR(cur_expected[i], cur_actual[i], threshold) << "i:" << i; } } else { - for (int i = 0; i < size; ++i) { + for (int64_t i = 0; i < size; ++i) { EXPECT_EQ(cur_expected[i], cur_actual[i]) << "i:" << i; } } @@ -176,8 +224,7 @@ struct TensorCheck { const std::string& /*provider_type*/) const { auto size = actual.Shape().Size(); - bool has_abs_err = params.absolute_error.has_value(); - bool has_rel_err = params.relative_error.has_value(); + const bool has_tolerance = params.absolute_error.has_value() || params.relative_error.has_value(); // deal with rare cases in which order of output data from a kernel MAY be // undefined @@ -198,7 +245,7 @@ struct TensorCheck { threshold = 0.005; #endif - for (int i = 0; i < size; ++i) { + for (int64_t i = 0; i < size; ++i) { // NOTE: Check isnan first to work around MSVC linker bug when /LTCG:incremental is specified. // If the isinf check is first the isnan check and branch gets omitted if (std::isnan(cur_expected[i])) { @@ -206,44 +253,33 @@ struct TensorCheck { } else if (std::isinf(cur_expected[i])) { // Test infinity for equality EXPECT_EQ(cur_expected[i], cur_actual[i]) << "Expected infinity. i:" << i; } else { - if (!has_abs_err && !has_rel_err) { - // the default for existing tests - EXPECT_NEAR(cur_expected[i], cur_actual[i], threshold) << "i:" << i; - } else { - if (has_abs_err) { - EXPECT_NEAR(cur_expected[i], cur_actual[i], *(params.absolute_error)) << "i:" << i; - } - if (has_rel_err) { - EXPECT_NEAR(cur_expected[i], cur_actual[i], *(params.relative_error) * std::abs(cur_expected[i])) - << "i:" << i; - } - } + double tolerance = has_tolerance ? 
get_tolerance(params, cur_expected[i]) : threshold; + EXPECT_NEAR(cur_expected[i], cur_actual[i], tolerance) << "i:" << i; } } } }; -template +template void InternalNumericalCheck(const Tensor& expected, const Tensor& actual, const ValidateOutputParams& params, const std::string& /*provider_type*/) { - const bool has_abs_err = params.absolute_error.has_value(); - const bool has_rel_err = params.relative_error.has_value(); + const bool has_tolerance = params.absolute_error.has_value() || params.relative_error.has_value(); // deal with rare cases in which order of output data from a kernel MAY be // undefined Tensor expected_sorted, actual_sorted; - const TypeToCheck* cur_expected; - const TypeToCheck* cur_actual; + const T* cur_expected; + const T* cur_actual; auto size = actual.Shape().Size(); if (params.sort_output) { - sort_expected_and_actual_buffers(expected, expected_sorted, actual, actual_sorted); - cur_expected = expected_sorted.Data(); - cur_actual = actual_sorted.Data(); + sort_expected_and_actual_buffers(expected, expected_sorted, actual, actual_sorted); + cur_expected = expected_sorted.Data(); + cur_actual = actual_sorted.Data(); } else { - cur_expected = expected.Data(); - cur_actual = actual.Data(); + cur_expected = expected.Data(); + cur_actual = actual.Data(); } #if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) @@ -252,7 +288,7 @@ void InternalNumericalCheck(const Tensor& expected, constexpr float threshold = 0.0001f; #endif - for (int i = 0; i < size; ++i) { + for (int64_t i = 0; i < size; ++i) { // NOTE: Check isnan first to work around MSVC linker bug when /LTCG:incremental is specified. // If the isinf check is first the isnan check and branch gets omitted if (std::isnan(cur_expected[i])) { @@ -260,19 +296,8 @@ void InternalNumericalCheck(const Tensor& expected, } else if (std::isinf(cur_expected[i])) { // Test infinity for equality EXPECT_EQ(cur_expected[i], cur_actual[i]) << "Expected infinity. i:" << i; } else { - if (!has_abs_err && !has_rel_err) { - // the default for existing tests - EXPECT_NEAR(cur_expected[i], cur_actual[i], threshold) << "i:" << i; - } else { - if (has_abs_err) { - EXPECT_NEAR(cur_expected[i], cur_actual[i], *(params.absolute_error)) - << "i:" << i; - } - if (has_rel_err) { - EXPECT_NEAR(cur_expected[i], cur_actual[i], *(params.relative_error) * std::abs(cur_expected[i])) - << "i:" << i; - } - } + T tolerance = has_tolerance ? get_tolerance(params, cur_expected[i]) : threshold; + EXPECT_NEAR(cur_expected[i], cur_actual[i], tolerance) << "i:" << i; } } } @@ -308,8 +333,7 @@ struct TensorCheck { sort_expected_and_actual_buffers(f_expected, f_actual); } - const bool has_abs_err = params.absolute_error.has_value(); - const bool has_rel_err = params.relative_error.has_value(); + const bool has_tolerance = params.absolute_error.has_value() || params.relative_error.has_value(); float threshold = 0.001f; #if defined(USE_TENSORRT) || defined(ENABLE_TRAINING_CORE) || defined(USE_CUDA) || defined(USE_ROCM) @@ -317,25 +341,14 @@ struct TensorCheck { #elif defined(USE_DML) threshold = 0.02f; #endif - for (int i = 0; i < size; ++i) { + for (int64_t i = 0; i < size; ++i) { if (std::isnan(f_expected[i])) { EXPECT_TRUE(std::isnan(f_expected[i])) << "Expected NaN. i:" << i; } else if (std::isinf(f_expected[i])) { // Test infinity for equality EXPECT_EQ(f_expected[i], f_actual[i]) << "Expected infinity. 
i:" << i; } else { - if (!has_abs_err && !has_rel_err) { - // the default for existing tests - EXPECT_NEAR(f_expected[i], f_actual[i], threshold) << "i:" << i; - } else { - if (has_abs_err) { - EXPECT_NEAR(f_expected[i], f_actual[i], *(params.absolute_error)) - << "i:" << i; - } - if (has_rel_err) { - EXPECT_NEAR(f_expected[i], f_actual[i], *(params.relative_error) * std::abs(static_cast(cur_expected[i]))) - << "i:" << i; - } - } + float tolerance = has_tolerance ? get_tolerance(params, f_expected[i]) : threshold; + EXPECT_NEAR(f_expected[i], f_actual[i], tolerance) << "i:" << i; } } } @@ -362,32 +375,24 @@ struct TensorCheck { sort_expected_and_actual_buffers(f_expected, f_actual); } - /// XXX: May need to adjust threshold as BFloat is coarse + const bool has_tolerance = params.absolute_error.has_value() || params.relative_error.has_value(); + float abs_threshold = 0.0001f; - float threshold = 0.001f; + float rel_threshold = 0.001f; #if defined(USE_TENSORRT) || defined(ENABLE_TRAINING_CORE) || defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) || defined(USE_DNNL) - threshold = 0.05f; // expect at least 95% close + rel_threshold = 0.05f; // expect at least 95% close #endif - for (int i = 0; i < size; ++i) { + for (int64_t i = 0; i < size; ++i) { if (std::isnan(f_expected[i])) { EXPECT_TRUE(std::isnan(f_expected[i])) << "Expected NaN. i:" << i; } else if (std::isinf(f_expected[i])) { // Test infinity for equality EXPECT_EQ(f_expected[i], f_actual[i]) << "Expected infinity. i:" << i; } else { - // the default for existing tests - const float max_value = fmax(fabs(f_expected[i]), fabs(f_actual[i])); - if (max_value != 0) { // max_value = 0 means output and expected are 0s. - const float abs_error = fabs(f_expected[i] - f_actual[i]); - if (abs_error <= abs_threshold) { - // if the absolute error is small enough, then no need to calculate realative error - EXPECT_NEAR(0, abs_error, abs_threshold); - } else { - // default for existing tests. - const float rel_error = abs_error / max_value; - EXPECT_NEAR(0, rel_error, threshold); - } - } + float tolerance = has_tolerance + ? get_tolerance(params, f_expected[i]) + : get_tolerance(abs_threshold, rel_threshold, f_expected[i]); + EXPECT_NEAR(f_expected[i], f_actual[i], tolerance) << "i:" << i; } } } From 72ce4de07df91b43d36d5c475a609095bde50a53 Mon Sep 17 00:00:00 2001 From: Ye Wang <52801275+wangyems@users.noreply.github.com> Date: Thu, 7 Mar 2024 18:15:18 +0000 Subject: [PATCH 125/279] cuda graph enhancement (#19636) ### Description 1. add a config key in run_options to control cuda graph in runtime. 2. enhance cuda graph class to support mutiple graph saving and retrieving in one ORT session 3. provide model modification/inference example on Phi2 4. benchmark shows an average of 13% latency reduction in token generation. limitation: TRT ep and ROCM ep hasn't applied this feature. we can revisit this in the future. 
### Motivation and Context --- .../core/framework/execution_provider.h | 14 +- .../onnxruntime_run_options_config_keys.h | 7 + .../providers/cuda/cuda_execution_provider.cc | 74 ++++-- .../providers/cuda/cuda_execution_provider.h | 17 +- onnxruntime/core/providers/cuda/cuda_graph.cc | 89 +++++-- onnxruntime/core/providers/cuda/cuda_graph.h | 48 +++- .../providers/js/js_execution_provider.cc | 10 +- .../core/providers/js/js_execution_provider.h | 4 +- .../providers/rocm/rocm_execution_provider.cc | 35 +-- .../providers/rocm/rocm_execution_provider.h | 12 +- .../providers/shared_library/provider_api.h | 1 + .../shared_library/provider_interfaces.h | 3 + .../shared_library/provider_wrappedtypes.h | 8 + .../tensorrt/tensorrt_execution_provider.cc | 38 +-- .../tensorrt/tensorrt_execution_provider.h | 16 +- onnxruntime/core/session/inference_session.cc | 22 +- onnxruntime/core/session/inference_session.h | 16 +- .../core/session/provider_bridge_ort.cc | 4 + .../models/phi2/convert_to_onnx.py | 79 +++++- .../models/phi2/inference_example.py | 236 ++++++++++++++++-- .../onnxruntime_test_python_cudagraph.py | 61 ++++- onnxruntime/test/shared_lib/test_inference.cc | 149 +++++++++++ onnxruntime/test/testdata/mul_1_dynamic.onnx | Bin 0 -> 142 bytes 23 files changed, 766 insertions(+), 177 deletions(-) create mode 100644 onnxruntime/test/testdata/mul_1_dynamic.onnx diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h index c1cc69edc17d8..40ca96a19aef1 100644 --- a/include/onnxruntime/core/framework/execution_provider.h +++ b/include/onnxruntime/core/framework/execution_provider.h @@ -202,21 +202,21 @@ class IExecutionProvider { /** Indicate whether the graph capturing mode (e.g., cuda graph) is enabled for - the provider. Currently only CUDA execution provider supports it. + the provider. */ virtual bool IsGraphCaptureEnabled() const { return false; } /** - Indicate whether the graph has been captured and instantiated. Currently - only CUDA execution provider supports it. + Indicate whether the graph has been captured and instantiated. */ - virtual bool IsGraphCaptured() const { return false; } + virtual bool IsGraphCaptured(int /*graph_annotation_id*/) const { return false; } /** - Run the instantiated graph. Currently only CUDA execution provider supports - it. + Run the instantiated graph. */ - virtual common::Status ReplayGraph() { return Status::OK(); } + virtual common::Status ReplayGraph(int /*graph_annotation_id*/) { + return Status::OK(); + } /** Called when session creation is complete diff --git a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h index b0a17e175fef3..c80b8c0c164b6 100644 --- a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h @@ -42,3 +42,10 @@ static const char* const kOrtRunOptionsConfigQnnPerfModePostRun = "qnn.htp_perf_ // Set RPC control latency for QNN HTP backend static const char* const kOrtRunOptionsConfigQnnRpcControlLatency = "qnn.rpc_control_latency"; + +// Set graph annotation id for CUDA EP. Use with enable_cuda_graph=true. +// The value should be an integer. If the value is not set, the default value is 0 and +// ORT session only captures one cuda graph before another capture is requested. +// If the value is set to -1, cuda graph capture/replay is disabled in that run. 
+// User are not expected to set the value to 0 as it is reserved for internal use. +static const char* const kOrtRunOptionsConfigCudaGraphAnnotation = "gpu_graph_id"; diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 3c0930638a205..bade2faf8f2e2 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -3,6 +3,7 @@ // Licensed under the MIT License. #include "core/common/inlined_containers.h" +#include "core/common/parse_string.h" #include "core/providers/shared_library/provider_api.h" #include "core/platform/env_var_utils.h" #include "core/providers/cuda/cuda_execution_provider.h" @@ -11,6 +12,7 @@ #include "core/providers/cuda/cuda_fwd.h" #include "core/providers/cuda/gpu_data_transfer.h" #include "core/providers/cuda/cuda_profiler.h" +#include "core/session/onnxruntime_run_options_config_keys.h" #ifndef USE_CUDA_MINIMAL #ifndef DISABLE_CONTRIB_OPS @@ -190,27 +192,46 @@ CUDAExecutionProvider::PerThreadContext::~PerThreadContext() { #endif } -bool CUDAExecutionProvider::PerThreadContext::IsGraphCaptureAllowed() const { - return regular_run_count_before_graph_capture_ >= min_num_runs_before_cuda_graph_capture_; +bool CUDAExecutionProvider::PerThreadContext::IsGraphCaptureAllowed( + CudaGraphAnnotation_t cuda_graph_annotation_id) const { + return regular_run_count_before_graph_capture_ >= min_num_runs_before_cuda_graph_capture_ && + IsGraphCaptureAllowedOnRun(cuda_graph_annotation_id); } -void CUDAExecutionProvider::PerThreadContext::CaptureBegin() { - cuda_graph_.Reset(); - cuda_graph_.CaptureBegin(); +bool CUDAExecutionProvider::PerThreadContext::IsGraphCaptureAllowedOnRun( + CudaGraphAnnotation_t cuda_graph_annotation_id) const { + return cuda_graph_.IsGraphCaptureAllowedOnRun(cuda_graph_annotation_id); } -void CUDAExecutionProvider::PerThreadContext::CaptureEnd() { - cuda_graph_.CaptureEnd(); - is_graph_captured_ = true; +CudaGraphAnnotation_t CUDAExecutionProvider::PerThreadContext::GetCudaGraphAnnotationId( + const onnxruntime::RunOptions& run_options) const { + auto graph_annotation_str = + run_options.GetConfigOptions().GetConfigEntry(kOrtRunOptionsConfigCudaGraphAnnotation); + // If graph annotation is not provided, fall back to the one cuda graph per session behavior + CudaGraphAnnotation_t cuda_graph_annotation_id = 0; + if (graph_annotation_str.has_value()) { + ORT_ENFORCE(TryParseStringWithClassicLocale(*graph_annotation_str, cuda_graph_annotation_id), + "Failed to parse the cuda graph annotation id: ", + *graph_annotation_str); + } + + return cuda_graph_annotation_id; +} + +void CUDAExecutionProvider::PerThreadContext::CaptureBegin(CudaGraphAnnotation_t cuda_graph_annotation_id) { + cuda_graph_.CaptureBegin(cuda_graph_annotation_id); +} + +void CUDAExecutionProvider::PerThreadContext::CaptureEnd(CudaGraphAnnotation_t cuda_graph_annotation_id) { + cuda_graph_.CaptureEnd(cuda_graph_annotation_id); } -bool CUDAExecutionProvider::PerThreadContext::IsGraphCaptured() const { - return is_graph_captured_; +bool CUDAExecutionProvider::PerThreadContext::IsGraphCaptured(CudaGraphAnnotation_t graph_annotation_id) const { + return cuda_graph_.IsGraphCaptured(graph_annotation_id); } -Status CUDAExecutionProvider::PerThreadContext::ReplayGraph() { - ORT_ENFORCE(IsGraphCaptured()); - return cuda_graph_.Replay(); +Status CUDAExecutionProvider::PerThreadContext::ReplayGraph(CudaGraphAnnotation_t graph_annotation_id) { + 
return cuda_graph_.Replay(graph_annotation_id); } void CUDAExecutionProvider::PerThreadContext::IncrementRegularRunCountBeforeGraphCapture() { @@ -386,23 +407,26 @@ Status CUDAExecutionProvider::Sync() const { return Status::OK(); } -Status CUDAExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { +Status CUDAExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_options) { // always set CUDA device when session::Run() in case it runs in a worker thread CUDA_RETURN_IF_ERROR(cudaSetDevice(GetDeviceId())); - if (IsGraphCaptureEnabled() && GetPerThreadContext().IsGraphCaptureAllowed() && !GetPerThreadContext().IsGraphCaptured()) { + CudaGraphAnnotation_t cuda_graph_annotation_id = GetPerThreadContext().GetCudaGraphAnnotationId(run_options); + if (IsGraphCaptureEnabled() && !GetPerThreadContext().IsGraphCaptured(cuda_graph_annotation_id) && + GetPerThreadContext().IsGraphCaptureAllowed(cuda_graph_annotation_id)) { LOGS(*GetLogger(), INFO) << "Capturing the cuda graph for this model"; - GetPerThreadContext().CaptureBegin(); + GetPerThreadContext().CaptureBegin(cuda_graph_annotation_id); } return Status::OK(); } -Status CUDAExecutionProvider::OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& /*run_options*/) { - if (IsGraphCaptureEnabled() && !GetPerThreadContext().IsGraphCaptured()) { - if (GetPerThreadContext().IsGraphCaptureAllowed()) { - GetPerThreadContext().CaptureEnd(); +Status CUDAExecutionProvider::OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) { + CudaGraphAnnotation_t cuda_graph_annotation_id = GetPerThreadContext().GetCudaGraphAnnotationId(run_options); + if (IsGraphCaptureEnabled() && !GetPerThreadContext().IsGraphCaptured(cuda_graph_annotation_id)) { + if (GetPerThreadContext().IsGraphCaptureAllowed(cuda_graph_annotation_id)) { + GetPerThreadContext().CaptureEnd(cuda_graph_annotation_id); // CUDA work issued to a capturing stream doesn’t actually run on the GPU, // so run the captured graph here to actually execute the work. 
- ORT_RETURN_IF_ERROR(GetPerThreadContext().ReplayGraph()); + ORT_RETURN_IF_ERROR(GetPerThreadContext().ReplayGraph(cuda_graph_annotation_id)); } else { GetPerThreadContext().IncrementRegularRunCountBeforeGraphCapture(); } @@ -433,12 +457,12 @@ bool CUDAExecutionProvider::IsGraphCaptureEnabled() const { return info_.enable_cuda_graph; } -bool CUDAExecutionProvider::IsGraphCaptured() const { - return GetPerThreadContext().IsGraphCaptured(); +bool CUDAExecutionProvider::IsGraphCaptured(int graph_annotation_id) const { + return GetPerThreadContext().IsGraphCaptured(graph_annotation_id); } -Status CUDAExecutionProvider::ReplayGraph() { - return GetPerThreadContext().ReplayGraph(); +Status CUDAExecutionProvider::ReplayGraph(int graph_annotation_id) { + return GetPerThreadContext().ReplayGraph(graph_annotation_id); } namespace cuda { diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h index 75fe1dff7c4a4..6c70e6abc4fdf 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h @@ -92,8 +92,8 @@ class CUDAExecutionProvider : public IExecutionProvider { std::unique_ptr GetProfiler() override; bool IsGraphCaptureEnabled() const override; - bool IsGraphCaptured() const override; - Status ReplayGraph() override; + bool IsGraphCaptured(CudaGraphAnnotation_t graph_annotation_id) const override; + Status ReplayGraph(CudaGraphAnnotation_t graph_annotation_id) override; void RegisterStreamHandlers(IStreamCommandHandleRegistry& stream_handle_registry, AllocatorMap& allocators) const override; OrtDevice GetOrtDeviceByMemType(OrtMemType mem_type) const override; std::vector CreatePreferredAllocators() override; @@ -168,11 +168,13 @@ class CUDAExecutionProvider : public IExecutionProvider { } } - bool IsGraphCaptureAllowed() const; - void CaptureBegin(); - void CaptureEnd(); - bool IsGraphCaptured() const; - Status ReplayGraph(); + bool IsGraphCaptureAllowed(CudaGraphAnnotation_t cuda_graph_annotation_id) const; + bool IsGraphCaptureAllowedOnRun(CudaGraphAnnotation_t cuda_graph_annotation_id) const; + void CaptureBegin(CudaGraphAnnotation_t cuda_graph_annotation_id); + void CaptureEnd(CudaGraphAnnotation_t cuda_graph_annotation_id); + bool IsGraphCaptured(CudaGraphAnnotation_t cuda_graph_annotation_id) const; + CudaGraphAnnotation_t GetCudaGraphAnnotationId(const onnxruntime::RunOptions& run_options) const; + Status ReplayGraph(CudaGraphAnnotation_t cuda_graph_annotation_id); void IncrementRegularRunCountBeforeGraphCapture(); private: @@ -192,7 +194,6 @@ class CUDAExecutionProvider : public IExecutionProvider { // Cuda graph with multi threads will be supported in the future, so cuda_graph_ // is put under PerThreadContext. 
CUDAGraph cuda_graph_; - bool is_graph_captured_ = false; int regular_run_count_before_graph_capture_ = 0; // There is chance that the second regular run allocates GPU memory for causes like: diff --git a/onnxruntime/core/providers/cuda/cuda_graph.cc b/onnxruntime/core/providers/cuda/cuda_graph.cc index 230d664391611..8353c654681fc 100644 --- a/onnxruntime/core/providers/cuda/cuda_graph.cc +++ b/onnxruntime/core/providers/cuda/cuda_graph.cc @@ -9,17 +9,44 @@ namespace onnxruntime { -CUDAGraph::CUDAGraph(cudaStream_t stream) : stream_(stream) { +CudaGraphSet::~CudaGraphSet() { + Clear(); } -void CUDAGraph::SetStream(cudaStream_t stream) { +void CudaGraphSet::Clear() { + for (auto& it : cuda_graphs_) { + CUDA_CALL_THROW(cudaGraphExecDestroy(it.second)); + } + cuda_graphs_.clear(); +} + +bool CudaGraphSet::Contains(CudaGraphAnnotation_t cuda_graph_annotation_id) const { + return cuda_graphs_.find(cuda_graph_annotation_id) != cuda_graphs_.end(); +} + +void CudaGraphSet::Put(CudaGraphAnnotation_t cuda_graph_annotation_id, cudaGraphExec_t graph_exec) { + ORT_ENFORCE(!Contains(cuda_graph_annotation_id)); + cuda_graphs_.emplace(cuda_graph_annotation_id, graph_exec); +} + +cudaGraphExec_t CudaGraphSet::Get(CudaGraphAnnotation_t cuda_graph_annotation_id) const { + ORT_ENFORCE(Contains(cuda_graph_annotation_id)); + return cuda_graphs_.at(cuda_graph_annotation_id); +} + +CUDAGraphManager::CUDAGraphManager(cudaStream_t stream) : stream_(stream) { +} + +void CUDAGraphManager::SetStream(cudaStream_t stream) { stream_ = stream; } -void CUDAGraph::CaptureBegin() { - ORT_ENFORCE(!has_graph_exec_, - "This cuda graph has already captured a graph. " - "Create a new instance to capture a new graph."); +void CUDAGraphManager::CaptureBegin(CudaGraphAnnotation_t cuda_graph_annotation_id) { + ORT_ENFORCE(IsGraphCaptureAllowedOnRun(cuda_graph_annotation_id)); + + ORT_ENFORCE(!cuda_graph_set_.Contains(cuda_graph_annotation_id), + "Trying to capture a graph with annotation id ", cuda_graph_annotation_id, + " that already used. Please use a different annotation id."); CUDA_CALL_THROW(cudaStreamSynchronize(stream_)); // For now cuda graph can only work with a single thread. 
In the future, we @@ -29,40 +56,48 @@ void CUDAGraph::CaptureBegin() { CUDA_CALL_THROW(cudaStreamBeginCapture(stream_, cudaStreamCaptureModeGlobal)); } -void CUDAGraph::CaptureEnd() { - CUDA_CALL_THROW(cudaStreamEndCapture(stream_, &graph_)); - if (graph_ == NULL) { +void CUDAGraphManager::CaptureEnd(CudaGraphAnnotation_t cuda_graph_annotation_id) { + cudaGraph_t graph = NULL; + CUDA_CALL_THROW(cudaStreamEndCapture(stream_, &graph)); + if (graph == NULL) { ORT_THROW("CUDAGraph::CaptureEnd: graph_ is NULL"); } - has_graph_ = true; - CUDA_CALL_THROW(cudaGraphInstantiate(&graph_exec_, graph_, NULL, NULL, 0)); - has_graph_exec_ = true; - CUDA_CALL_THROW(cudaGraphDestroy(graph_)); - has_graph_ = false; + cudaGraphExec_t graph_exec = NULL; + CUDA_CALL_THROW(cudaGraphInstantiate(&graph_exec, graph, NULL, NULL, 0)); + CUDA_CALL_THROW(cudaGraphDestroy(graph)); + + // Currently all the captured graphs will be tied to the session's lifecycle + // TODO(wy): Addd an interface to free captured graphs + cuda_graph_set_.Put(cuda_graph_annotation_id, graph_exec); } -Status CUDAGraph::Replay() { +Status CUDAGraphManager::Replay(CudaGraphAnnotation_t cuda_graph_annotation_id) { // Although this function is not thread safe, the lock is not needed here because // CUDA EP maintains a separate cuda graph per thread - LOGS_DEFAULT(INFO) << "Replaying CUDA graph on stream " << stream_; - CUDA_RETURN_IF_ERROR(cudaGraphLaunch(graph_exec_, stream_)); + LOGS_DEFAULT(INFO) << "Replaying CUDA graph on stream " << stream_ << " with cuda_graph_annotation_id " + << cuda_graph_annotation_id; + + cudaGraphExec_t graph_exec = cuda_graph_set_.Get(cuda_graph_annotation_id); + CUDA_RETURN_IF_ERROR(cudaGraphLaunch(graph_exec, stream_)); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream_)); return Status::OK(); } -void CUDAGraph::Reset() { - if (has_graph_) { - CUDA_CALL_THROW(cudaGraphDestroy(graph_)); - has_graph_ = false; - } - if (has_graph_exec_) { - CUDA_CALL_THROW(cudaGraphExecDestroy(graph_exec_)); - has_graph_exec_ = false; - } +bool CUDAGraphManager::IsGraphCaptureAllowedOnRun(CudaGraphAnnotation_t cuda_graph_annotation_id) const { + return cuda_graph_annotation_id != kCudaGraphAnnotationSkip; +} + +bool CUDAGraphManager::IsGraphCaptured(CudaGraphAnnotation_t cuda_graph_annotation_id) const { + return cuda_graph_set_.Contains(cuda_graph_annotation_id); +} + +void CUDAGraphManager::Reset() { + cuda_graph_set_.Clear(); } -CUDAGraph::~CUDAGraph() { +CUDAGraphManager::~CUDAGraphManager() { Reset(); } diff --git a/onnxruntime/core/providers/cuda/cuda_graph.h b/onnxruntime/core/providers/cuda/cuda_graph.h index 9bcefcc64ea77..064994c1f14ae 100644 --- a/onnxruntime/core/providers/cuda/cuda_graph.h +++ b/onnxruntime/core/providers/cuda/cuda_graph.h @@ -3,33 +3,55 @@ #pragma once +#include + #include "core/common/common.h" #include "core/platform/ort_mutex.h" #include "core/providers/cuda/cuda_pch.h" namespace onnxruntime { -using CaptureId_t = unsigned long long; +using CudaGraphAnnotation_t = int; +using CudaGraphSet_t = std::unordered_map; + +constexpr CudaGraphAnnotation_t kCudaGraphAnnotationSkip = -1; +constexpr CudaGraphAnnotation_t kCudaGraphAnnotationDefault = 0; + +struct CudaGraphSet { + CudaGraphSet(){}; + ~CudaGraphSet(); -struct CUDAGraph { - CUDAGraph(){}; - CUDAGraph(cudaStream_t stream); - ~CUDAGraph(); + void Clear(); + bool Contains(CudaGraphAnnotation_t cuda_graph_annotation_id) const; + void Put(CudaGraphAnnotation_t cuda_graph_annotation_id, cudaGraphExec_t graph_exec); + cudaGraphExec_t 
Get(CudaGraphAnnotation_t cuda_graph_annotation_id) const; + + private: + CudaGraphSet_t cuda_graphs_; +}; + +struct CUDAGraphManager { + CUDAGraphManager(){}; + CUDAGraphManager(cudaStream_t stream); + ~CUDAGraphManager(); void SetStream(cudaStream_t stream); - void CaptureBegin(); - void CaptureEnd(); - Status Replay(); + void CaptureBegin(CudaGraphAnnotation_t cuda_graph_annotation_id); + void CaptureEnd(CudaGraphAnnotation_t cuda_graph_annotation_id); + Status Replay(CudaGraphAnnotation_t cuda_graph_annotation_id); + void Reset(); - private: - cudaGraph_t graph_ = NULL; - cudaGraphExec_t graph_exec_ = NULL; + bool IsGraphCaptureAllowedOnRun(CudaGraphAnnotation_t cuda_graph_annotation_id) const; + bool IsGraphCaptured(CudaGraphAnnotation_t cuda_graph_annotation_id) const; - bool has_graph_ = false; - bool has_graph_exec_ = false; + private: + CudaGraphSet cuda_graph_set_; + CudaGraphAnnotation_t cuda_graph_annotation_id_ = kCudaGraphAnnotationDefault; cudaStream_t stream_ = nullptr; // Does not own the stream }; +using CUDAGraph = CUDAGraphManager; + } // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 62c3981682cfc..2d2c89f36f1a7 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -757,7 +757,7 @@ JsExecutionProvider::~JsExecutionProvider() { } Status JsExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { - if (IsGraphCaptureEnabled() && IsGraphCaptureAllowed() && !IsGraphCaptured()) { + if (IsGraphCaptureEnabled() && IsGraphCaptureAllowed() && !IsGraphCaptured(0)) { LOGS(*GetLogger(), INFO) << "Capturing the webgpu graph for this model"; EM_ASM({ Module.jsepCaptureBegin(); }); } @@ -765,7 +765,7 @@ Status JsExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_opti } Status JsExecutionProvider::OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& /*run_options*/) { - if (IsGraphCaptureEnabled() && !IsGraphCaptured()) { + if (IsGraphCaptureEnabled() && !IsGraphCaptured(0)) { if (IsGraphCaptureAllowed()) { EM_ASM({ Module.jsepCaptureEnd(); }); is_graph_captured_ = true; @@ -781,12 +781,12 @@ bool JsExecutionProvider::IsGraphCaptureEnabled() const { return enable_graph_capture_; } -bool JsExecutionProvider::IsGraphCaptured() const { +bool JsExecutionProvider::IsGraphCaptured(int) const { return is_graph_captured_; } -Status JsExecutionProvider::ReplayGraph() { - ORT_ENFORCE(IsGraphCaptured()); +Status JsExecutionProvider::ReplayGraph(int) { + ORT_ENFORCE(IsGraphCaptured(0)); EM_ASM({ Module.jsepReplay(); }); return Status::OK(); } diff --git a/onnxruntime/core/providers/js/js_execution_provider.h b/onnxruntime/core/providers/js/js_execution_provider.h index b4518c67d1e60..efacf510e75df 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.h +++ b/onnxruntime/core/providers/js/js_execution_provider.h @@ -63,8 +63,8 @@ class JsExecutionProvider : public IExecutionProvider { Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override; bool IsGraphCaptureEnabled() const override; - bool IsGraphCaptured() const override; - Status ReplayGraph() override; + bool IsGraphCaptured(int graph_annotation_id) const override; + Status ReplayGraph(int graph_annotation_id) override; private: bool IsGraphCaptureAllowed() const; diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc 
b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc index 4a679b790ee40..32be74550951e 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc @@ -183,23 +183,24 @@ bool ROCMExecutionProvider::PerThreadContext::IsGraphCaptureAllowed() const { return regular_run_count_before_graph_capture_ >= min_num_runs_before_hip_graph_capture_; } -void ROCMExecutionProvider::PerThreadContext::CaptureBegin() { +void ROCMExecutionProvider::PerThreadContext::CaptureBegin(int) { hip_graph_.Reset(); - hip_graph_.CaptureBegin(); + hip_graph_.CaptureBegin(0); } -void ROCMExecutionProvider::PerThreadContext::CaptureEnd() { - hip_graph_.CaptureEnd(); +void ROCMExecutionProvider::PerThreadContext::CaptureEnd(int) { + hip_graph_.CaptureEnd(0); is_graph_captured_ = true; } -bool ROCMExecutionProvider::PerThreadContext::IsGraphCaptured() const { +bool ROCMExecutionProvider::PerThreadContext::IsGraphCaptured(int) const { return is_graph_captured_; } -Status ROCMExecutionProvider::PerThreadContext::ReplayGraph() { - ORT_ENFORCE(IsGraphCaptured()); - return hip_graph_.Replay(); +Status ROCMExecutionProvider::PerThreadContext::ReplayGraph(int graph_annotation_id) { + ORT_ENFORCE(IsGraphCaptured(graph_annotation_id)); + + return hip_graph_.Replay(graph_annotation_id); } void ROCMExecutionProvider::PerThreadContext::IncrementRegularRunCountBeforeGraphCapture() { @@ -356,20 +357,20 @@ Status ROCMExecutionProvider::Sync() const { Status ROCMExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { // always set ROCM device when session::Run() in case it runs in a worker thread HIP_RETURN_IF_ERROR(hipSetDevice(GetDeviceId())); - if (IsGraphCaptureEnabled() && GetPerThreadContext().IsGraphCaptureAllowed() && !GetPerThreadContext().IsGraphCaptured()) { + if (IsGraphCaptureEnabled() && GetPerThreadContext().IsGraphCaptureAllowed() && !GetPerThreadContext().IsGraphCaptured(0)) { LOGS_DEFAULT(INFO) << "Capturing the hip graph for this model"; - GetPerThreadContext().CaptureBegin(); + GetPerThreadContext().CaptureBegin(0); } return Status::OK(); } Status ROCMExecutionProvider::OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& /*run_options*/) { - if (IsGraphCaptureEnabled() && !GetPerThreadContext().IsGraphCaptured()) { + if (IsGraphCaptureEnabled() && !GetPerThreadContext().IsGraphCaptured(0)) { if (GetPerThreadContext().IsGraphCaptureAllowed()) { - GetPerThreadContext().CaptureEnd(); + GetPerThreadContext().CaptureEnd(0); // HIP work issued to a capturing stream doesn’t actually run on the GPU, // so run the captured graph here to actually execute the work. 
- ORT_RETURN_IF_ERROR(GetPerThreadContext().ReplayGraph()); + ORT_RETURN_IF_ERROR(GetPerThreadContext().ReplayGraph(0)); } else { GetPerThreadContext().IncrementRegularRunCountBeforeGraphCapture(); } @@ -400,12 +401,12 @@ bool ROCMExecutionProvider::IsGraphCaptureEnabled() const { return info_.enable_hip_graph; } -bool ROCMExecutionProvider::IsGraphCaptured() const { - return GetPerThreadContext().IsGraphCaptured(); +bool ROCMExecutionProvider::IsGraphCaptured(int) const { + return GetPerThreadContext().IsGraphCaptured(0); } -Status ROCMExecutionProvider::ReplayGraph() { - return GetPerThreadContext().ReplayGraph(); +Status ROCMExecutionProvider::ReplayGraph(int /*graph_annotation_id*/) { + return GetPerThreadContext().ReplayGraph(0); } namespace rocm { diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.h b/onnxruntime/core/providers/rocm/rocm_execution_provider.h index da671d9e863bb..6d6c05027e7bd 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.h +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.h @@ -75,8 +75,8 @@ class ROCMExecutionProvider : public IExecutionProvider { std::unique_ptr GetProfiler() override; bool IsGraphCaptureEnabled() const override; - bool IsGraphCaptured() const override; - Status ReplayGraph() override; + bool IsGraphCaptured(int graph_annotation_id) const override; + Status ReplayGraph(int graph_annotation_id) override; void RegisterStreamHandlers(IStreamCommandHandleRegistry& stream_handle_registry, AllocatorMap& allocators) const override; OrtDevice GetOrtDeviceByMemType(OrtMemType mem_type) const override; std::vector CreatePreferredAllocators() override; @@ -139,10 +139,10 @@ class ROCMExecutionProvider : public IExecutionProvider { } bool IsGraphCaptureAllowed() const; - void CaptureBegin(); - void CaptureEnd(); - bool IsGraphCaptured() const; - Status ReplayGraph(); + void CaptureBegin(int graph_annotation_id); + void CaptureEnd(int graph_annotation_id); + bool IsGraphCaptured(int graph_annotation_id) const; + Status ReplayGraph(int graph_annotation_id); void IncrementRegularRunCountBeforeGraphCapture(); private: diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index b78279040acb6..1cebe4a256fd4 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -159,6 +159,7 @@ class OpKernel; struct OpKernelContext; struct OpKernelInfo; struct PrimitiveDataTypeBase; +struct OrtRunOptions; struct Tensor; struct SparseTensor; class TensorSeq; diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index f5a8327443864..0b8551e0c5a66 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -481,6 +481,9 @@ struct ProviderHost { // ConfigOptions virtual std::optional ConfigOptions__GetConfigEntry(const ConfigOptions* p, const std::string& config_key) = 0; + // OrtRunOptions + virtual const ConfigOptions& RunOptions__GetConfigOptions(const RunOptions* p) = 0; + // ComputeCapability virtual std::unique_ptr ComputeCapability__construct(std::unique_ptr t_sub_graph) = 0; virtual void ComputeCapability__operator_delete(ComputeCapability* p) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h 
index dde4005c80b9d..dc2b79015d95e 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -393,6 +393,14 @@ struct ConfigOptions final { PROVIDER_DISALLOW_ALL(ConfigOptions) }; +struct OrtRunOptions final { + const ConfigOptions& GetConfigOptions() const { + return g_host->RunOptions__GetConfigOptions(this); + } + + PROVIDER_DISALLOW_ALL(OrtRunOptions) +}; + struct ComputeCapability final { static std::unique_ptr Create(std::unique_ptr t_sub_graph) { return g_host->ComputeCapability__construct(std::move(t_sub_graph)); } static void operator delete(void* p) { g_host->ComputeCapability__operator_delete(reinterpret_cast(p)); } diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index e521640681a77..632d521dc21a8 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -1633,26 +1633,26 @@ bool TensorrtExecutionProvider::IsGraphCaptureAllowed() const { return regular_run_count_before_graph_capture_ >= min_num_runs_before_cuda_graph_capture_; } -void TensorrtExecutionProvider::CaptureBegin() { +void TensorrtExecutionProvider::CaptureBegin(int) { cuda_graph_.Reset(); - cuda_graph_.CaptureBegin(); + cuda_graph_.CaptureBegin(0); } -void TensorrtExecutionProvider::CaptureEnd() { - cuda_graph_.CaptureEnd(); +void TensorrtExecutionProvider::CaptureEnd(int) { + cuda_graph_.CaptureEnd(0); is_graph_captured_ = true; } -bool TensorrtExecutionProvider::IsGraphCaptured() const { +bool TensorrtExecutionProvider::IsGraphCaptured(int) const { return is_graph_captured_; } -Status TensorrtExecutionProvider::ReplayGraph() { - ORT_ENFORCE(IsGraphCaptured()); +Status TensorrtExecutionProvider::ReplayGraph(int) { + ORT_ENFORCE(IsGraphCaptured(0)); // Please note that CUDAGraph::Replay() is not thread safe. - // ORT TRT calls ReplayGraph() in compute_func() where synchromization is enforced due to lock_guard(), + // ORT TRT calls ReplayGraph() in compute_func() where synchronization is enforced due to lock_guard(), // therefore calling CUDAGraph::Replay() here is guaranteed to be thread safe. - return cuda_graph_.Replay(); + return cuda_graph_.Replay(0); } void TensorrtExecutionProvider::IncrementRegularRunCountBeforeGraphCapture() { @@ -3412,10 +3412,10 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView // Start CUDA graph capture. // Note: The reason we don't put graph capture in OnRunStart() like CUDA EP does is because // current ORT TRT doesn't get cuda stream until compute time and graph capture requires cuda stream. - if (cuda_graph_enable_ && IsGraphCaptureAllowed() && !IsGraphCaptured()) { + if (cuda_graph_enable_ && IsGraphCaptureAllowed() && !IsGraphCaptured(0)) { LOGS_DEFAULT(INFO) << "Capturing the cuda graph for this model"; cuda_graph_.SetStream(stream); - CaptureBegin(); + CaptureBegin(0); } // Run TRT inference @@ -3483,12 +3483,12 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView // Note: One reason we don't put end of graph capture in OnRunEnd() like CUDA EP does is because of cuda stream mentioned in graph capture // above, another reason is because OnRunEnd() is not synchronized with OnRunStart() and ExecuteGraph() per inference_session.cc. 
// It's safe to start/end CUDA graph capture in compute_func() here since cuda graph object is maintained by a per thread basis. - if (cuda_graph_enable_ && !IsGraphCaptured()) { + if (cuda_graph_enable_ && !IsGraphCaptured(0)) { if (IsGraphCaptureAllowed()) { - CaptureEnd(); + CaptureEnd(0); // CUDA work issued to a capturing stream doesn’t actually run on the GPU, // so run the captured graph here to actually execute the work. - ORT_RETURN_IF_ERROR(ReplayGraph()); + ORT_RETURN_IF_ERROR(ReplayGraph(0)); } else { IncrementRegularRunCountBeforeGraphCapture(); } @@ -3705,10 +3705,10 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con // Start CUDA graph capture. // Note: The reason we don't put graph capture in OnRunStart() like CUDA EP does is because // current ORT TRT doesn't get cuda stream until compute time and graph capture requires cuda stream. - if (cuda_graph_enable_ && IsGraphCaptureAllowed() && !IsGraphCaptured()) { + if (cuda_graph_enable_ && IsGraphCaptureAllowed() && !IsGraphCaptured(0)) { LOGS_DEFAULT(INFO) << "Capturing the cuda graph for this model"; cuda_graph_.SetStream(stream); - CaptureBegin(); + CaptureBegin(0); } // Run TRT inference @@ -3776,12 +3776,12 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con // Note: One reason we don't put end of graph capture in OnRunEnd() like CUDA EP does is because of cuda stream mentioned in graph capture // above, another reason is because OnRunEnd() is not synchronized with OnRunStart() and ExecuteGraph() per inference_session.cc. // It's safe to start/end CUDA graph capture in compute_func() here since cuda graph object is maintained by a per thread basis. - if (cuda_graph_enable_ && !IsGraphCaptured()) { + if (cuda_graph_enable_ && !IsGraphCaptured(0)) { if (IsGraphCaptureAllowed()) { - CaptureEnd(); + CaptureEnd(0); // CUDA work issued to a capturing stream doesn’t actually run on the GPU, // so run the captured graph here to actually execute the work. 
- ORT_RETURN_IF_ERROR(ReplayGraph()); + ORT_RETURN_IF_ERROR(ReplayGraph(0)); } else { IncrementRegularRunCountBeforeGraphCapture(); } diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index 339c45a8742d2..f73031eaefceb 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -250,8 +250,8 @@ class TensorrtExecutionProvider : public IExecutionProvider { std::vector CreatePreferredAllocators() override; bool IsGraphCaptureEnabled() const override; - bool IsGraphCaptured() const override; - Status ReplayGraph() override; + bool IsGraphCaptured(int graph_annotation_id) const override; + Status ReplayGraph(int graph_annotation_id) override; private: mutable TensorrtExecutionProviderInfo info_; @@ -373,10 +373,10 @@ class TensorrtExecutionProvider : public IExecutionProvider { void InitCUDAGraph(); void SetGraphStream(cudaStream_t stream); bool IsGraphCaptureAllowed() const; - void CaptureBegin(); - void CaptureEnd(); - bool IsGraphCaptured() const; - Status ReplayGraph(); + void CaptureBegin(int graph_annotation_id); + void CaptureEnd(int graph_annotation_id); + bool IsGraphCaptured(int graph_annotation_id) const; + Status ReplayGraph(int graph_annotation_id); void IncrementRegularRunCountBeforeGraphCapture(); private: @@ -540,8 +540,8 @@ class TensorrtExecutionProvider : public IExecutionProvider { std::vector& node_compute_funcs); bool IsGraphCaptureAllowed() const; - void CaptureBegin(); - void CaptureEnd(); + void CaptureBegin(int graph_annotation_id); + void CaptureEnd(int graph_annotation_id); void IncrementRegularRunCountBeforeGraphCapture(); /** diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 5fd66c459d382..684f390857d0b 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -2383,21 +2383,32 @@ Status InferenceSession::Run(const RunOptions& run_options, Status retval = Status::OK(); const Env& env = Env::Default(); + int graph_annotation_id = 0; + const std::string& graph_annotation_str = + run_options.config_options.GetConfigOrDefault(kOrtRunOptionsConfigCudaGraphAnnotation, ""); + if (!graph_annotation_str.empty()) { + if (!TryParseStringWithClassicLocale(graph_annotation_str, graph_annotation_id)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Failed to parse the cuda graph annotation id: ", + graph_annotation_str); + } + } + // Increment/decrement concurrent_num_runs_ and control // session threads spinning as configured. Do nothing for graph replay except the counter. const bool control_spinning = use_per_session_threads_ && force_spinning_stop_between_runs_ && - !cached_execution_provider_for_graph_replay_.IsGraphCaptured(); + !cached_execution_provider_for_graph_replay_.IsGraphCaptured(graph_annotation_id); auto* intra_tp = (control_spinning) ? thread_pool_.get() : nullptr; auto* inter_tp = (control_spinning) ? inter_op_thread_pool_.get() : nullptr; ThreadPoolSpinningSwitch runs_refcounter_and_tp_spin_control(intra_tp, inter_tp, current_num_runs_); // Check if this Run() is simply going to be a CUDA Graph replay. 
- if (cached_execution_provider_for_graph_replay_.IsGraphCaptured()) { + if (cached_execution_provider_for_graph_replay_.IsGraphCaptured(graph_annotation_id)) { LOGS(*session_logger_, INFO) << "Replaying the captured " << cached_execution_provider_for_graph_replay_.Type() - << " CUDA Graph for this model with tag: " << run_options.run_tag; - ORT_RETURN_IF_ERROR_SESSIONID_(cached_execution_provider_for_graph_replay_.ReplayGraph()); + << " CUDA Graph for this model with tag: " << run_options.run_tag + << " with graph annotation id: " << graph_annotation_id; + ORT_RETURN_IF_ERROR_SESSIONID_(cached_execution_provider_for_graph_replay_.ReplayGraph(graph_annotation_id)); } else { InlinedVector exec_providers_to_stop; exec_providers_to_stop.reserve(execution_providers_.NumProviders()); @@ -2559,7 +2570,8 @@ Status InferenceSession::Run(const RunOptions& run_options, // N is defined in min_num_runs_before_hip_graph_capture_ for ROCM EP, // and the value could be different for other EP. if (retval.IsOK() && cached_execution_provider_for_graph_replay_.IsGraphCaptureEnabled() && - !cached_execution_provider_for_graph_replay_.IsGraphCaptured()) { + cached_execution_provider_for_graph_replay_.AllowGraphCaptureOnRun(graph_annotation_id) && + !cached_execution_provider_for_graph_replay_.IsGraphCaptured(graph_annotation_id)) { LOGS(*session_logger_, INFO) << "Start another run for necessary memory allocation or graph capture."; ORT_RETURN_IF_ERROR(Run(run_options, feed_names, feeds, output_names, p_fetches, p_fetches_device_info)); } diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h index f8211bfd2dd4e..3038c8d22ec80 100644 --- a/onnxruntime/core/session/inference_session.h +++ b/onnxruntime/core/session/inference_session.h @@ -675,7 +675,6 @@ class InferenceSession { * If we encounter an invalid request, we return an error * back to the user. */ - [[nodiscard]] common::Status ValidateAndParseShrinkArenaString(const std::string& ort_device_list, /*out*/ InlinedVector& arenas_to_shrink) const; @@ -867,14 +866,17 @@ class InferenceSession { return cached_execution_provider_for_graph_replay_ != nullptr && cached_execution_provider_for_graph_replay_->IsGraphCaptureEnabled(); } - bool IsGraphCaptured() const { - return cached_execution_provider_for_graph_replay_ != nullptr && cached_execution_provider_for_graph_replay_->IsGraphCaptured(); + bool IsGraphCaptured(int graph_annotation_id) const { + return cached_execution_provider_for_graph_replay_ != nullptr && cached_execution_provider_for_graph_replay_->IsGraphCaptured(graph_annotation_id); + } + + bool AllowGraphCaptureOnRun(int graph_annotation_id) const { + return cached_execution_provider_for_graph_replay_ != nullptr && graph_annotation_id != kGraphAnnotationSkip; } - Status ReplayGraph() { - ORT_ENFORCE(IsGraphCaptured()); + Status ReplayGraph(int graph_annotation_id) { if (cached_execution_provider_for_graph_replay_) { - return cached_execution_provider_for_graph_replay_->ReplayGraph(); + return cached_execution_provider_for_graph_replay_->ReplayGraph(graph_annotation_id); } return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Cached EP instance for graph replay is not set yet before calling ReplayGraph()"); } @@ -884,6 +886,8 @@ class InferenceSession { } IExecutionProvider* cached_execution_provider_for_graph_replay_ = nullptr; + // TODO(wy): Same as kCudaGraphAnnotationSkip in cuda_graph.h. Move to a common place. 
+ constexpr static int kGraphAnnotationSkip = -1; }; CachedExecutionProviderForGraphReplay cached_execution_provider_for_graph_replay_; diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 3bec9aa146f76..d6797512d9e47 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -14,6 +14,7 @@ #include "core/framework/execution_provider.h" #include "core/framework/kernel_registry.h" #include "core/framework/provider_shutdown.h" +#include "core/framework/run_options.h" #include "core/framework/tensorprotoutils.h" #include "core/framework/TensorSeq.h" #include "core/framework/provider_options.h" @@ -676,6 +677,9 @@ struct ProviderHostImpl : ProviderHost { return p->GetConfigEntry(config_key); } + // OrtRunOptions (wrapped) + const ConfigOptions& RunOptions__GetConfigOptions(const RunOptions* p) override { return p->config_options; } + // ComputeCapability (wrapped) std::unique_ptr ComputeCapability__construct(std::unique_ptr t_sub_graph) override { return std::make_unique(std::move(t_sub_graph)); } void ComputeCapability__operator_delete(ComputeCapability* p) override { delete p; } diff --git a/onnxruntime/python/tools/transformers/models/phi2/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/phi2/convert_to_onnx.py index 796d6ec55ef80..8083778423241 100644 --- a/onnxruntime/python/tools/transformers/models/phi2/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/phi2/convert_to_onnx.py @@ -13,6 +13,7 @@ import torch from benchmark_helper import Precision from fusion_options import AttentionOpType +from onnx_model import OnnxModel from transformers import AutoConfig, AutoModelForCausalLM from onnxruntime.quantization.matmul_4bits_quantizer import MatMul4BitsQuantizer @@ -168,6 +169,58 @@ def optimize_phi2_onnx(self, onnx_path: str, onnx_path_opt: str): quant.process() quant.model.save_model_to_file(onnx_path_opt, use_external_data_format=True) + # This function currently only works for phi2 model + def convert_to_use_cuda_graph(self, in_onnx_path: str, out_onnx_path: str): + onnx_model = OnnxModel(onnx.load(in_onnx_path, load_external_data=True)) + + from onnx import TensorProto, helper + + graph = onnx_model.graph() + new_inputs = [] + for vi in graph.input: + if "attention_mask" in vi.name: + vi_seqlen_k = helper.make_tensor_value_info( + "seqlens_k", + elem_type=TensorProto.INT32, + shape=["batch_size"], + ) + vi_total_seq_len = helper.make_tensor_value_info( + "total_sequence_length", + elem_type=TensorProto.INT32, + shape=[1], + ) + new_inputs.extend([vi_seqlen_k, vi_total_seq_len]) + else: + new_inputs.append(vi) + + graph.ClearField("input") + graph.input.extend(new_inputs) + + gqas = onnx_model.get_nodes_by_op_type("GroupQueryAttention") + gqa = gqas[0] + seqlens_path = onnx_model.match_parent_path( + gqa, + ["Cast", "Sub", "ReduceSum", "Cast"], + [5, 0, 0, 0], + ) + if seqlens_path is None: + raise RuntimeError("Failed to find seqlens path for GroupQueryAttention node.") + total_seq_len_path = onnx_model.match_parent_path( + gqa, + ["Cast", "Gather", "Shape"], + [6, 0, 0], + ) + if total_seq_len_path is None: + raise RuntimeError("Failed to find total_seq_len path for GroupQueryAttention node.") + onnx_model.remove_nodes(seqlens_path) + onnx_model.remove_nodes(total_seq_len_path) + + for gqa in gqas: + gqa.input[5] = "seqlens_k" + gqa.input[6] = "total_sequence_length" + + onnx_model.save(onnx_model.model, out_onnx_path, 
save_as_external_data=True) + def parse_arguments(): parser = argparse.ArgumentParser() @@ -235,6 +288,13 @@ def parse_arguments(): help="Generate int4 ONNX model for ORT VLLM", ) + parser.add_argument( + "--use_cuda_graph", + required=False, + action="store_true", + help="Use CUDA Graph in decoding process", + ) + parser.add_argument( "--overwrite", required=False, @@ -265,6 +325,13 @@ def parse_arguments(): help="Run ORT inference example", ) + parser.add_argument( + "--run_benchmark", + required=False, + action="store_true", + help="Run ORT benchmark", + ) + parser.add_argument( "--skip_export", required=False, @@ -375,6 +442,9 @@ def run_optimize_phi2_onnx( ): converter.init_attn_type_and_precision(attention_type, precision) converter.optimize_phi2_onnx(original_onnx_path, optimized_onnx_path) + if args.use_cuda_graph: + assert args.fp16_gpu_sm8x or args.int4_gpu_sm8x + converter.convert_to_use_cuda_graph(optimized_onnx_path, optimized_onnx_path) processes = [] if args.fp32_cpu: @@ -447,7 +517,7 @@ def run_optimize_phi2_onnx( [p.start() for p in processes] [p.join() for p in processes] - if args.run_example: + if args.run_example or args.run_benchmark: from inference_example import run_phi2 if args.fp16_gpu_sm8x: @@ -457,6 +527,8 @@ def run_optimize_phi2_onnx( use_buffer_share=True, device_id=args.device_id, use_step=True, + use_cuda_graph=args.use_cuda_graph, + run_benchmark=args.run_benchmark, ) if args.int4_gpu_sm8x: logging.info("Running int4_gpu_sm8x example...") @@ -465,6 +537,8 @@ def run_optimize_phi2_onnx( use_buffer_share=True, device_id=args.device_id, use_step=True, + use_cuda_graph=args.use_cuda_graph, + run_benchmark=args.run_benchmark, ) if args.fp32_gpu: logging.info("Running fp32_gpu example...") @@ -474,6 +548,7 @@ def run_optimize_phi2_onnx( device_id=args.device_id, packed_kv=True, use_fp16=False, + run_benchmark=args.run_benchmark, ) if args.fp16_gpu: logging.info("Running fp16_gpu example...") @@ -482,6 +557,7 @@ def run_optimize_phi2_onnx( use_buffer_share=False, device_id=args.device_id, packed_kv=True, + run_benchmark=args.run_benchmark, ) if args.int4_gpu: logging.info("Running int4_gpu example...") @@ -490,6 +566,7 @@ def run_optimize_phi2_onnx( use_buffer_share=False, device_id=args.device_id, packed_kv=True, + run_benchmark=args.run_benchmark, ) if args.fp32_cpu or args.int4_cpu or args.fp16_vllm or args.int4_vllm: raise NotImplementedError("CPU/vllm inference example is not implemented yet.") diff --git a/onnxruntime/python/tools/transformers/models/phi2/inference_example.py b/onnxruntime/python/tools/transformers/models/phi2/inference_example.py index 28828ffb853cb..829334b46b469 100644 --- a/onnxruntime/python/tools/transformers/models/phi2/inference_example.py +++ b/onnxruntime/python/tools/transformers/models/phi2/inference_example.py @@ -17,6 +17,17 @@ } +def cuda_memcpy(dst, src): + from cuda import cudart + + cudart.cudaMemcpy( + dst.data_ptr(), + src.data_ptr(), + src.element_size() * src.nelement(), + cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice, + ) + + class ORTGenerator: def __init__(self, decoder_path): self.onnx_decoder_path = decoder_path @@ -24,13 +35,68 @@ def __init__(self, decoder_path): self.head_size = 80 self.num_layers = 32 self.max_sequence_length = 2048 + self.device_id = 0 + self.use_cuda_graph = False + self.use_traced_inputs = False + self.static_inputs_map = {} + + def append_static_inputs(self, batch_size): + # Only use this function with GQA and with use_cuda_graph=True + if batch_size in self.static_inputs_map: + return + 
+ cpu_device = torch.device("cpu") + cuda_device = torch.device("cuda", self.device_id) + + static_io = {} + static_io["input_ids"] = torch.zeros((batch_size, 1), dtype=torch.int32, device=cuda_device) + static_io["step"] = torch.tensor([0], dtype=torch.int64, device=cuda_device) + static_io["seqlens_k"] = torch.tensor(batch_size * [0], dtype=torch.int32, device=cuda_device) + static_io["total_sequence_length"] = torch.tensor([0], dtype=torch.int32, device=cpu_device) + + cache_shape = (batch_size, self.num_heads, self.max_sequence_length, self.head_size) + for i in range(self.num_layers): + cache = torch.zeros(cache_shape, device=cuda_device, dtype=torch.float16) + static_io.update({f"past_key_{i}": cache.contiguous(), f"past_value_{i}": cache.clone().contiguous()}) + + static_io["logits"] = torch.zeros((batch_size, 1, 51200), dtype=torch.float16, device=cuda_device) + + self.static_inputs_map[batch_size] = static_io def get_initial_inputs_and_outputs(self, encodings_dict): self.torch_dtype = torch.float16 if self.use_fp16 else torch.float32 input_ids = torch.tensor(encodings_dict["input_ids"], device=self.device, dtype=torch.int32) attention_mask = torch.tensor(encodings_dict["attention_mask"], device=self.device, dtype=torch.int32) - step = torch.tensor([0], device=self.device, dtype=torch.int64) + + batch_size, sequence_length = input_ids.shape + + self.use_traced_inputs = ( + self.use_cuda_graph + and (batch_size in self.static_inputs_map) + and self.use_buffer_share + and not self.packed_kv + ) + + step = ( + torch.tensor([0], device=self.device, dtype=torch.int64) + if not self.use_traced_inputs + else self.static_inputs_map[batch_size]["step"] + ) + + seqlens_k = ( + torch.tensor(batch_size * [0], device=self.device, dtype=torch.int32) + if not self.use_traced_inputs + else self.static_inputs_map[batch_size]["seqlens_k"] + ) + cuda_memcpy(seqlens_k, attention_mask.sum(1).sub(1).to(torch.int32)) + + total_seq_length = ( + torch.tensor([0], device=torch.device("cpu"), dtype=torch.int32) + if not self.use_traced_inputs + else self.static_inputs_map[batch_size]["total_sequence_length"] + ) + total_seq_length[0] = sequence_length inputs = { "input_ids": input_ids.contiguous(), @@ -40,7 +106,10 @@ def get_initial_inputs_and_outputs(self, encodings_dict): if self.use_step: inputs["step"] = step.contiguous() - batch_size, sequence_length = input_ids.shape + if self.use_cuda_graph: + inputs["seqlens_k"] = seqlens_k.contiguous() + inputs["total_sequence_length"] = total_seq_length.contiguous() + del inputs["attention_mask"] past_seq_length = self.max_sequence_length if self.use_buffer_share else 0 past_shape = ( @@ -48,11 +117,21 @@ def get_initial_inputs_and_outputs(self, encodings_dict): if self.packed_kv else (batch_size, self.num_heads, past_seq_length, self.head_size) ) - for i in range(self.num_layers): - past = torch.zeros(past_shape, device=self.device, dtype=self.torch_dtype) - inputs.update( - {f"past_key_{i}": past.contiguous(), f"past_value_{i}": past.clone().contiguous()} - ) if not self.packed_kv else inputs.update({f"past_{i}": past.contiguous()}) + + if not self.use_traced_inputs: + for i in range(self.num_layers): + past = torch.zeros(past_shape, device=self.device, dtype=self.torch_dtype) + inputs.update( + {f"past_key_{i}": past.contiguous(), f"past_value_{i}": past.clone().contiguous()} + ) if not self.packed_kv else inputs.update({f"past_{i}": past.contiguous()}) + else: + for i in range(self.num_layers): + inputs.update( + { + f"past_key_{i}": 
self.static_inputs_map[batch_size][f"past_key_{i}"].contiguous(), + f"past_value_{i}": self.static_inputs_map[batch_size][f"past_value_{i}"].contiguous(), + } + ) logits = torch.zeros(batch_size, sequence_length, 51200, device=self.device, dtype=self.torch_dtype) outputs = {"logits": logits.contiguous()} @@ -111,12 +190,23 @@ def apply_io_binding(self, model: ort.InferenceSession, inputs: dict, outputs: d return io_binding - def create_session(self, device_id, use_fp16=True, use_buffer_share=True, packed_kv=False, use_step=False): + def create_session( + self, device_id, use_fp16=True, use_buffer_share=True, packed_kv=False, use_step=False, use_cuda_graph=False + ): + self.device_id = device_id sess_options = ort.SessionOptions() - ep = ("CUDAExecutionProvider", {"device_id": device_id}) if device_id >= 0 else "CPUExecutionProvider" + sess_options.log_verbosity_level = 4 + sess_options.log_severity_level = 4 + self.use_cuda_graph = use_cuda_graph + ep = ( + ("CUDAExecutionProvider", {"device_id": self.device_id, "enable_cuda_graph": self.use_cuda_graph}) + if self.device_id >= 0 + else "CPUExecutionProvider" + ) self.sess = ort.InferenceSession(self.onnx_decoder_path, sess_options=sess_options, providers=[ep]) + self.ro = ort.RunOptions() - self.device = torch.device("cuda", device_id) if torch.cuda.is_available() else torch.device("cpu") + self.device = torch.device("cuda", self.device_id) if torch.cuda.is_available() else torch.device("cpu") self.use_fp16 = use_fp16 self.use_buffer_share = use_buffer_share self.packed_kv = packed_kv @@ -125,9 +215,7 @@ def create_session(self, device_id, use_fp16=True, use_buffer_share=True, packed self.tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True) self.tokenizer.pad_token = "[PAD]" - def generate(self, prompt, max_length): - encodings_dict = self.tokenizer.batch_encode_plus(prompt, padding=True) - + def generate_impl(self, encodings_dict, max_length, cuda_graph_annotation, benchmark=False): inputs, outputs = self.get_initial_inputs_and_outputs(encodings_dict) all_token_ids = inputs["input_ids"].clone() @@ -136,13 +224,38 @@ def generate(self, prompt, max_length): current_length = sequence_length has_eos = torch.zeros(batch_size, device=self.device, dtype=torch.bool) + if benchmark: + import time + + latency = [] + + prompt_run = True while current_length < max_length: io_binding = self.apply_io_binding(self.sess, inputs, outputs) + if benchmark: + start = time.time() + io_binding.synchronize_inputs() - self.sess.run_with_iobinding(io_binding) + if prompt_run: + if self.use_cuda_graph: + # Disable CUDA graph for the prompt run + self.ro.add_run_config_entry("gpu_graph_id", "-1") + self.sess.run_with_iobinding(io_binding, self.ro) + if self.use_cuda_graph: + # Enable CUDA graph for the decoding run + self.ro.add_run_config_entry( + "gpu_graph_id", str(cuda_graph_annotation) if self.use_traced_inputs else "-1" + ) + prompt_run = False + else: + self.sess.run_with_iobinding(io_binding, self.ro) io_binding.synchronize_outputs() + if benchmark: + end = time.time() + latency.append(end - start) + # Sample with argmax (greedy search) next_token_logits = outputs["logits"][:, -1, :] next_tokens = torch.argmax(next_token_logits, dim=-1) @@ -161,16 +274,37 @@ def generate(self, prompt, max_length): # Update inputs for next inference run current_length += 1 + inputs["input_ids"] = tokens_to_add.to(torch.int32) + if self.use_traced_inputs: + cuda_memcpy(self.static_inputs_map[batch_size]["input_ids"], inputs["input_ids"]) + 
inputs["input_ids"] = self.static_inputs_map[batch_size]["input_ids"] + if self.use_step: inputs["step"] = torch.tensor([current_length - 1], device=self.device, dtype=torch.int64) - inputs["attention_mask"] = torch.cat([inputs["attention_mask"], (~has_eos).reshape(batch_size, 1)], 1).to( - torch.int32 - ) + if self.use_traced_inputs: + cuda_memcpy(self.static_inputs_map[batch_size]["step"], inputs["step"]) + inputs["step"] = self.static_inputs_map[batch_size]["step"] + + if self.use_cuda_graph: + previous_seqlens_k = inputs["seqlens_k"] + inputs["seqlens_k"] = (previous_seqlens_k + (~has_eos).reshape(batch_size, 1)).to(torch.int32) + inputs["total_sequence_length"][0] = current_length + if self.use_traced_inputs: + cuda_memcpy(self.static_inputs_map[batch_size]["seqlens_k"], inputs["seqlens_k"]) + inputs["seqlens_k"] = self.static_inputs_map[batch_size]["seqlens_k"] + self.static_inputs_map[batch_size]["total_sequence_length"][0] = inputs["total_sequence_length"][0] + inputs["total_sequence_length"] = self.static_inputs_map[batch_size]["total_sequence_length"] + else: + inputs["attention_mask"] = torch.cat( + [inputs["attention_mask"], (~has_eos).reshape(batch_size, 1)], 1 + ).to(torch.int32) # Set logits to zeros for next inference run and re-use memory buffer if outputs["logits"].shape[1] != 1: outputs["logits"] = outputs["logits"][:, :1, :].contiguous() + if self.use_traced_inputs: + outputs["logits"] = self.static_inputs_map[batch_size]["logits"] outputs["logits"].zero_() if not self.use_buffer_share: @@ -193,11 +327,59 @@ def generate(self, prompt, max_length): {f"present_key_{i}": present.contiguous(), f"present_value_{i}": present.clone().contiguous()} ) if not self.packed_kv else outputs.update({f"present_{i}": present.contiguous()}) + if benchmark: + print( + f"Batch size: {batch_size}, Sequence length: {sequence_length}, Token num: {max_length - sequence_length}" + ) + print(f"Prompt letency: {1000 * latency[0]}ms, Token latency: {1000 * np.mean(latency[1:])}ms") + return + texts = self.tokenizer.batch_decode(all_token_ids, skip_special_tokens=True) return texts + def generate(self, prompt, max_length, cuda_graph_annotation): + encodings_dict = self.tokenizer.batch_encode_plus(prompt, padding=True) + + return self.generate_impl(encodings_dict, max_length, cuda_graph_annotation) + + def generate_benchmark(self, prompt_shape, token_num, cuda_graph_annotation): + batch_size, sequence_length = prompt_shape + max_length = sequence_length + token_num + + encodings_dict = {} + encodings_dict["input_ids"] = torch.randint(0, 50264, (batch_size, sequence_length), dtype=torch.int32).tolist() + encodings_dict["attention_mask"] = torch.ones((batch_size, sequence_length), dtype=torch.int32).tolist() + + # Warm up run + self.generate_impl(encodings_dict, max_length, cuda_graph_annotation, benchmark=False) + + # Benchmark run + self.generate_impl(encodings_dict, max_length, cuda_graph_annotation, benchmark=True) + + +def run_phi2( + onnx_model_path, + use_buffer_share, + device_id, + packed_kv=False, + use_fp16=True, + use_step=False, + use_cuda_graph=False, + run_benchmark=False, +): + generator = ORTGenerator(onnx_model_path) + generator.create_session(device_id, use_fp16, use_buffer_share, packed_kv, use_step, use_cuda_graph) + + def simple_run(prompt): + example_batch_size = len(prompt) + if use_cuda_graph: + generator.append_static_inputs(batch_size=example_batch_size) + texts = generator.generate(prompt, max_length=210, cuda_graph_annotation=example_batch_size) + + for i in 
range(len(texts)): + print("Prompt: ", prompt[i]) + print("Texts: ", texts[i]) -def run_phi2(onnx_model_path, use_buffer_share, device_id, packed_kv=False, use_fp16=True, use_step=False): prompt = [ '''```python def print_prime(n): @@ -206,10 +388,14 @@ def print_prime(n): """''' ] - generator = ORTGenerator(onnx_model_path) - generator.create_session(device_id, use_fp16, use_buffer_share, packed_kv, use_step) - texts = generator.generate(prompt, max_length=200) - - for i in range(len(texts)): - print("Prompt: ", prompt[i]) - print("Texts: ", texts[i]) + if not run_benchmark: + simple_run(prompt) + + # Run simple benchmark. Time the decoder only. + if run_benchmark: + token_num = 32 + for batch_size in [1, 2, 4, 8]: + generator.append_static_inputs(batch_size) + for sequence_length in [16, 512]: + prompt_shape = (batch_size, sequence_length) + generator.generate_benchmark(prompt_shape, token_num, cuda_graph_annotation=batch_size) diff --git a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py index c4e13e773535d..ce04dff2aecb0 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py @@ -84,6 +84,7 @@ def test_select_ep_to_run_cuda_graph(self): elif "CUDAExecutionProvider" in onnxrt.get_available_providers(): providers = [("CUDAExecutionProvider", {"enable_cuda_graph": True})] self.run_model_with_cuda_graph(providers) + self.run_model_with_cuda_graph_annotation(providers) def run_model_with_cuda_graph(self, providers): INPUT_SIZE = 1280 # noqa: N806 @@ -100,13 +101,15 @@ def run_model_with_cuda_graph(self, providers): io_binding.bind_ortvalue_input("X", x_ortvalue) io_binding.bind_ortvalue_output("Y", y_ortvalue) + ro = onnxrt.RunOptions() + # One regular run for the necessary memory allocation and cuda graph capturing - session.run_with_iobinding(io_binding) + session.run_with_iobinding(io_binding, ro) expected_y = np.array([[5.0], [11.0], [17.0]] * INPUT_SIZE, dtype=np.float32) np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) # After capturing, CUDA graph replay happens from this Run onwards - session.run_with_iobinding(io_binding) + session.run_with_iobinding(io_binding, ro) np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) # Update input and then replay CUDA graph @@ -116,7 +119,7 @@ def run_model_with_cuda_graph(self, providers): dtype=np.float32, ) ) - session.run_with_iobinding(io_binding) + session.run_with_iobinding(io_binding, ro) np.testing.assert_allclose( np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), y_ortvalue.numpy(), @@ -124,6 +127,58 @@ def run_model_with_cuda_graph(self, providers): atol=1e-05, ) + def run_model_with_cuda_graph_annotation(self, providers): + INPUT_SIZE = 1280 # noqa: N806 + + x_base = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]] + y_base = [[0.0], [0.0], [0.0], [0.0]] + expected_y_base = [[5.0], [11.0], [17.0], [23.0]] + + x_base_mul_10 = [[10.0, 20.0], [30.0, 40.0], [50.0, 60.0], [70.0, 80.0]] + expected_y_base_mul_10 = [[50.0], [110.0], [170.0], [230.0]] + + test_num = 4 + + x_ortvalues = [] + y_ortvalues = [] + for i in range(test_num): + x = np.array(x_base[: i + 1][:] * INPUT_SIZE, dtype=np.float32) + y = np.array(y_base[: i + 1][:] * INPUT_SIZE, dtype=np.float32) + x_ortvalues.append(onnxrt.OrtValue.ortvalue_from_numpy(x, "cuda", 0)) + y_ortvalues.append(onnxrt.OrtValue.ortvalue_from_numpy(y, "cuda", 0)) 
+ + onnxrt.set_default_logger_severity(0) + session = onnxrt.InferenceSession(get_name("matmul_2.onnx"), providers=providers) + io_bindings = [session.io_binding()] * test_num + ro = onnxrt.RunOptions() + + # Regular run to capture CUDA graph + for i in range(test_num): + io_bindings[i].bind_ortvalue_input("X", x_ortvalues[i]) + io_bindings[i].bind_ortvalue_output("Y", y_ortvalues[i]) + # TODO: Temporarily remove the default cuda graph capture test for the first regular run + # because it fails on a training CI. Need to investigate the root cause. + ro.add_run_config_entry("gpu_graph_id", str(i + 1)) + io_bindings[i].synchronize_inputs() + session.run_with_iobinding(io_bindings[i], ro) + io_bindings[i].synchronize_outputs() + expected_y = np.array(expected_y_base[: i + 1][:] * INPUT_SIZE, dtype=np.float32) + np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + + del ro + ro = onnxrt.RunOptions() + + # After capturing, CUDA graph replay happens from this Run onwards + for i in range(test_num): + # Update input and then replay CUDA graph + x_ortvalues[i].update_inplace(np.array(x_base_mul_10[: i + 1][:] * INPUT_SIZE, dtype=np.float32)) + ro.add_run_config_entry("gpu_graph_id", str(i + 1)) + io_bindings[i].synchronize_inputs() + session.run_with_iobinding(io_bindings[i], ro) + io_bindings[i].synchronize_outputs() + expected_y = np.array(expected_y_base_mul_10[: i + 1][:] * INPUT_SIZE, dtype=np.float32) + np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + def test_arena_with_cuda_graph(self): if "CUDAExecutionProvider" in onnxrt.get_available_providers(): # To test cuda graph catpure, we set Arena extend strategy to be SameAsRequested so as to detect any diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 8dad2c8e2d10d..453b5fdd360bf 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -180,6 +180,9 @@ static void TestInference(Ort::Env& env, const std::basic_string& mod } static constexpr PATH_TYPE MODEL_URI = TSTR("testdata/mul_1.onnx"); +#if defined(USE_CUDA) +static constexpr PATH_TYPE CUDA_GRAPH_ANNOTATION_MODEL_URI = TSTR("testdata/mul_1_dynamic.onnx"); +#endif static constexpr PATH_TYPE MATMUL_MODEL_URI = TSTR("testdata/matmul_1.onnx"); #ifndef ORT_NO_RTTI static constexpr PATH_TYPE SEQUENCE_MODEL_URI = TSTR("testdata/sequence_length.onnx"); @@ -2082,6 +2085,152 @@ TEST(CApiTest, basic_cuda_graph) { #endif } +#if defined(USE_CUDA) +struct CudaGraphInputOutputData_0 { + const std::array x_shape = {3, 2}; + std::array x_values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + const std::array expected_y_shape = {3, 2}; + std::array expected_y = {1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f}; + + std::array y_values; + std::array new_x_values = {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f}; + std::array new_expected_y = {10.0f, 40.0f, 90.0f, 160.0f, 250.0f, 360.0f}; +} cg_data_0; + +struct CudaGraphInputOutputData_1 { + const std::array x_shape = {3, 1}; + std::array x_values = {1.0f, 3.0f, 5.0f}; + const std::array expected_y_shape = {3, 2}; + std::array expected_y = {1.0f, 2.0f, 9.0f, 12.0f, 25.0f, 30.0f}; + + std::array y_values; + std::array new_x_values = {10.0f, 30.0f, 50.0f}; + std::array new_expected_y = {10.0f, 20.0f, 90.0f, 120.0f, 250.0f, 300.0f}; +} cg_data_1; + +struct CudaGraphInputOutputData_2 { + const std::array x_shape = {1, 2}; + std::array x_values = {1.0f, 2.0f}; + const std::array 
expected_y_shape = {3, 2}; + std::array expected_y = {1.0f, 4.0f, 3.0f, 8.0f, 5.0f, 12.0f}; + + std::array y_values; + std::array new_x_values = {10.0f, 20.0f}; + std::array new_expected_y = {10.0f, 40.0f, 30.0f, 80.0f, 50.0f, 120.0f}; +} cg_data_2; + +template +static void RunWithCudaGraphAnnotation(T& cg_data, + Ort::Session& session, + Ort::MemoryInfo& info_mem, + Ort::MemoryAllocation& input_data, + Ort::MemoryAllocation& output_data, + const char* cuda_graph_annotation) { + (void)cudaMemcpy(input_data.get(), + cg_data.x_values.data(), + sizeof(float) * cg_data.x_values.size(), + cudaMemcpyHostToDevice); + + // Create an OrtValue tensor backed by data on CUDA memory + Ort::Value bound_x = Ort::Value::CreateTensor(info_mem, + reinterpret_cast(input_data.get()), + cg_data.x_values.size(), + cg_data.x_shape.data(), + cg_data.x_shape.size()); + + // Create an OrtValue tensor backed by data on CUDA memory + Ort::Value bound_y = Ort::Value::CreateTensor(info_mem, + reinterpret_cast(output_data.get()), + cg_data.expected_y.size(), + cg_data.expected_y_shape.data(), + cg_data.expected_y_shape.size()); + + // Create IoBinding for inputs and outputs. + Ort::IoBinding binding(session); + binding.BindInput("X", bound_x); + binding.BindOutput("Y", bound_y); + + Ort::RunOptions run_option; + if (cuda_graph_annotation != nullptr) { + run_option.AddConfigEntry(kOrtRunOptionsConfigCudaGraphAnnotation, cuda_graph_annotation); + } + + // One regular run for necessary memory allocation and graph capturing + session.Run(run_option, binding); + + // Check the values against the bound raw memory (needs copying from device to host first) + (void)cudaMemcpy(cg_data.y_values.data(), + output_data.get(), + sizeof(float) * cg_data.y_values.size(), + cudaMemcpyDeviceToHost); + ASSERT_THAT(cg_data.y_values, ::testing::ContainerEq(cg_data.expected_y)); + + // Replay the captured CUDA graph + session.Run(run_option, binding); + (void)cudaMemcpy(cg_data.y_values.data(), + output_data.get(), + sizeof(float) * cg_data.y_values.size(), + cudaMemcpyDeviceToHost); + ASSERT_THAT(cg_data.y_values, ::testing::ContainerEq(cg_data.expected_y)); + + // Change the input and replay the CUDA graph again. + (void)cudaMemcpy(input_data.get(), + cg_data.new_x_values.data(), + sizeof(float) * cg_data.new_x_values.size(), + cudaMemcpyHostToDevice); + binding.SynchronizeInputs(); + + session.Run(run_option, binding); + (void)cudaMemcpy(cg_data.y_values.data(), + output_data.get(), + sizeof(float) * cg_data.y_values.size(), + cudaMemcpyDeviceToHost); + ASSERT_THAT(cg_data.y_values, ::testing::ContainerEq(cg_data.new_expected_y)); + + // Clean up + binding.ClearBoundInputs(); + binding.ClearBoundOutputs(); +} + +TEST(CApiTest, basic_cuda_graph_with_annotation) { + const auto& api = Ort::GetApi(); + Ort::SessionOptions session_options; + + // Enable cuda graph in cuda provider option. 
+ OrtCUDAProviderOptionsV2* cuda_options = nullptr; + ASSERT_TRUE(api.CreateCUDAProviderOptions(&cuda_options) == nullptr); + std::unique_ptr + rel_cuda_options(cuda_options, api.ReleaseCUDAProviderOptions); + std::vector keys{"enable_cuda_graph"}; + std::vector values{"1"}; + ASSERT_TRUE(api.UpdateCUDAProviderOptions(rel_cuda_options.get(), keys.data(), values.data(), 1) == nullptr); + + ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_CUDA_V2( + static_cast(session_options), + rel_cuda_options.get()) == nullptr); + + Ort::Session session(*ort_env, CUDA_GRAPH_ANNOTATION_MODEL_URI, session_options); + Ort::MemoryInfo info_mem("Cuda", OrtAllocatorType::OrtArenaAllocator, 0, OrtMemTypeDefault); + + Ort::Allocator allocator(session, info_mem); + auto allocator_info = allocator.GetInfo(); + ASSERT_TRUE(info_mem == allocator_info); + + size_t max_input_size = 6; + size_t max_output_size = 6; + + auto input_data = allocator.GetAllocation(max_input_size * sizeof(float)); + auto output_data = allocator.GetAllocation(max_output_size * sizeof(float)); + + ASSERT_NE(input_data.get(), nullptr); + ASSERT_NE(output_data.get(), nullptr); + + RunWithCudaGraphAnnotation(cg_data_0, session, info_mem, input_data, output_data, nullptr); + RunWithCudaGraphAnnotation(cg_data_1, session, info_mem, input_data, output_data, "1"); + RunWithCudaGraphAnnotation(cg_data_2, session, info_mem, input_data, output_data, "2"); +} +#endif + // The following test uses some ops not supported in the reduced ops build #ifndef REDUCED_OPS_BUILD #if defined(USE_CUDA) || defined(USE_TENSORRT) diff --git a/onnxruntime/test/testdata/mul_1_dynamic.onnx b/onnxruntime/test/testdata/mul_1_dynamic.onnx new file mode 100644 index 0000000000000000000000000000000000000000..fb7822498b0048716e701f4c23846d30ae36a6dc GIT binary patch literal 142 zcmd;J7Gg`zNX;urw5s6}b^xTQ7&8C> literal 0 HcmV?d00001 From 3dfce2f1cd9776f312f68f1cfc0d826875adcb67 Mon Sep 17 00:00:00 2001 From: Jambay Kinley Date: Thu, 7 Mar 2024 11:31:34 -0800 Subject: [PATCH 126/279] Fix argparser in `matmul_bnb4_quantizer` (#19812) ### Description The argparser had incorrectly used `description` and `options` instead of `help` and `choices`. ### Motivation and Context Fixes: #19751 --- .../python/tools/quantization/matmul_bnb4_quantizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/python/tools/quantization/matmul_bnb4_quantizer.py b/onnxruntime/python/tools/quantization/matmul_bnb4_quantizer.py index 951746a089305..2bf47fe1680e9 100644 --- a/onnxruntime/python/tools/quantization/matmul_bnb4_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_bnb4_quantizer.py @@ -199,14 +199,14 @@ def parse_args(): "--quant_type", required=False, default=1, - options=[MatMulBnb4Quantizer.FP4, MatMulBnb4Quantizer.NF4], + choices=[MatMulBnb4Quantizer.FP4, MatMulBnb4Quantizer.NF4], help="Quantization data type. 0: FP4, 1: NF4", ) parser.add_argument( "--block_size", required=False, default=64, - description="Block size for blockwise quantization. Note: bnb.nn.Linear4bit only uses block_size=64", + help="Block size for blockwise quantization. 
Note: bnb.nn.Linear4bit only uses block_size=64", ) parser.add_argument("-v", "--verbose", required=False, action="store_true") parser.set_defaults(verbose=False) From 33578cc76efc19b50c9fc011215b2777de193cd1 Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Thu, 7 Mar 2024 13:54:16 -0800 Subject: [PATCH 127/279] Remove memset for the case no any mask (#19823) Improved OCR model speed by 1.034 end-to-end, by eliminating unnecessary memset when no mask is present. --- .../contrib_ops/cpu/bert/attention_cpu_base.h | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h index b761b1afd8529..c617533319a18 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h @@ -140,17 +140,6 @@ class AttentionCPUBase : public AttentionBase { if (mask_data != nullptr) { PrepareMask(mask_index, mask_index_dims, mask_data, causal, batch_size, sequence_length, past_sequence_length, mask_filter_value_); - } else { // no any mask - const int memset_loop_len = batch_size * num_heads_; - const double memset_cost = static_cast(sequence_length) * total_sequence_length; - - ThreadPool::TryParallelFor(tp, memset_loop_len, memset_cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { - for (std::ptrdiff_t i = begin; i != end; ++i) { - const int output_offset = static_cast(i) * sequence_length * total_sequence_length; - T* output = attention_probs + output_offset; - memset(output, 0, static_cast(sequence_length) * total_sequence_length * sizeof(T)); - } - }); } const int loop_len = batch_size * num_heads_; @@ -188,7 +177,7 @@ class AttentionCPUBase : public AttentionBase { // B: K' (B x N x) T x H (B x N x) H x T H x T // C: attention_probs (B x N x) S x T (B x N x) S x T S x T math::Gemm(CblasNoTrans, CblasTrans, sequence_length, total_sequence_length, head_size, alpha, - Q + q_input_chunk_length * i, k, 1.0, + Q + q_input_chunk_length * i, k, mask_data != nullptr ? 1.0f : 0.0f, output, nullptr); if (relative_position_bias_data != nullptr) { From 296435264182e09cc37cfd981b012854226ddd2c Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Thu, 7 Mar 2024 15:46:11 -0800 Subject: [PATCH 128/279] Implement IsNaN-9,13,20 for CUDA along with tests (#19807) ### Description ### Motivation and Context Some models require IsNan CUDA along with training --- docs/OperatorKernels.md | 5 +- .../providers/cpu/cpu_execution_provider.cc | 4 ++ .../core/providers/cpu/tensor/isnan.cc | 19 +++++- .../core/providers/cuda/cu_inc/common.cuh | 59 ++++++++++++++++++- .../providers/cuda/cuda_execution_provider.cc | 7 ++- .../cuda/math/unary_elementwise_ops.cc | 44 ++++++++++++++ .../cuda/math/unary_elementwise_ops.h | 6 ++ .../cuda/math/unary_elementwise_ops_impl.cu | 24 +++++++- .../cuda/math/unary_elementwise_ops_impl.h | 14 +++++ .../core/providers/rocm/cu_inc/common.cuh | 57 ++++++++++++++++++ .../providers/rocm/rocm_execution_provider.cc | 6 ++ .../test/providers/cpu/tensor/isnan_test.cc | 16 ++++- 12 files changed, 252 insertions(+), 9 deletions(-) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 4514a85531d6b..9f5cd4cc842dc 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -162,7 +162,7 @@ Do not modify directly.* |InstanceNormalization|*in* input:**T**
*in* scale:**T**<br> *in* B:**T**<br> *out* output:**T**|6+|**T** = tensor(float)|
|IsInf|*in* X:**T1**<br> *out* Y:**T2**|20+|**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)<br> **T2** = tensor(bool)|
|||[10, 19]|**T1** = tensor(double), tensor(float)<br> **T2** = tensor(bool)|
-|IsNaN|*in* X:**T1**<br> *out* Y:**T2**|20+|**T1** = tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)<br> **T2** = tensor(bool)|
+|IsNaN|*in* X:**T1**<br> *out* Y:**T2**|20+|**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)<br> **T2** = tensor(bool)|
|||[13, 19]|**T1** = tensor(double), tensor(float), tensor(float16)<br> **T2** = tensor(bool)|
|||[9, 12]|**T1** = tensor(double), tensor(float), tensor(float16)<br> **T2** = tensor(bool)|
|LRN|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(float)|
|InstanceNormalization|*in* input:**T**<br> *in* scale:**T**<br> *in* B:**T**<br> *out* output:**T**|6+|**T** = tensor(double), tensor(float), tensor(float16)|
|IsInf|*in* X:**T1**<br> *out* Y:**T2**|20+|**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)<br> **T2** = tensor(bool)|
|||[10, 19]|**T1** = tensor(double), tensor(float)<br> **T2** = tensor(bool)|
+|IsNaN|*in* X:**T1**<br> *out* Y:**T2**|20+|**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)<br> **T2** = tensor(bool)|
+|||[13, 19]|**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)<br> **T2** = tensor(bool)|
+|||[9, 12]|**T1** = tensor(double), tensor(float), tensor(float16)<br> **T2** = tensor(bool)|
|LRN|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
|||[1, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
|LSTM|*in* X:**T**<br> *in* W:**T**<br> *in* R:**T**<br> *in* B:**T**<br> *in* sequence_lens:**T1**<br> *in* initial_h:**T**<br> *in* initial_c:**T**<br> *in* P:**T**<br> *out* Y:**T**<br> *out* Y_h:**T**<br> *out* Y_c:**T**|14+|**T** = tensor(double), tensor(float), tensor(float16)<br>
**T1** = tensor(int32)| diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc index 7e0f919deb0a7..c3d5a51b636ef 100644 --- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc +++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc @@ -714,6 +714,7 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDoma class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, 19, float, IsNaN); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, 19, double, IsNaN); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, 19, MLFloat16, IsNaN); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, 19, BFloat16, IsNaN); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, bool, NonZero); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, float, NonZero); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, int32_t, NonZero); @@ -1023,6 +1024,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, float, IsNaN); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, double, IsNaN); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, MLFloat16, IsNaN); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, BFloat16, IsNaN); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Gelu); #if !defined(DISABLE_FLOAT8_TYPES) class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Float8E4M3FN, IsNaN); @@ -2553,6 +2555,8 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) BuildKernelCreateInfo::Compute(OpKernelContext* context) const { template <> Status IsNaN::Compute(OpKernelContext* context) const { const auto* X_ptr = context->Input(0); - if (!X_ptr) { - return Status(common::ONNXRUNTIME, common::FAIL, "Null input ptr"); - } + auto X_data = X_ptr->Data(); auto& dims = X_ptr->Shape(); auto shape_size = dims.Size(); @@ -91,6 +91,19 @@ Status IsNaN::Compute(OpKernelContext* context) const { return Status::OK(); } +template <> +Status IsNaN::Compute(OpKernelContext* context) const { + const auto* X_ptr = context->Input(0); + + auto X_data = X_ptr->DataAsSpan(); + auto& Y = *context->Output(0, X_ptr->Shape()); + + std::transform(X_data.begin(), X_data.end(), Y.MutableData(), + [](BFloat16 x) { return x.IsNaN(); }); + + return Status::OK(); +} + #if !defined(DISABLE_FLOAT8_TYPES) template <> Status IsNaN::Compute(OpKernelContext* context) const { diff --git a/onnxruntime/core/providers/cuda/cu_inc/common.cuh b/onnxruntime/core/providers/cuda/cu_inc/common.cuh index bba9178348132..bed2f677166d6 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/common.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/common.cuh @@ -485,7 +485,7 @@ struct IsInfTyped { #if !defined(DISABLE_FLOAT8_TYPES) -template +template struct ReturnFalse { constexpr static bool __device__ __inline__ IsInf(T) { return false; } constexpr static bool __device__ __inline__ IsInfPos(T) { return 
false; } @@ -532,6 +532,63 @@ struct _IsInf { } }; +// float and double +template +struct _IsNan { + __device__ __inline__ bool operator()(T a) const { + return isnan(a); + } +}; + +template <> +struct _IsNan { + __device__ __inline__ bool operator()(half a) const { + return static_cast(*reinterpret_cast(&a) & ~MLFloat16::kSignMask) + > MLFloat16::kPositiveInfinityBits; + } +}; + +template <> +struct _IsNan { + __device__ __inline__ bool operator()(BFloat16 a) const { + return static_cast(*reinterpret_cast(&a) & ~BFloat16::kSignMask) + > BFloat16::kPositiveInfinityBits; + } +}; + +#if !defined(DISABLE_FLOAT8_TYPES) + +template<> +struct _IsNan { + __device__ __inline__ bool operator()(Float8E4M3FN a) const { + return (*reinterpret_cast(&a) & 0x7f) == 0x7f; + } +}; + +template<> +struct _IsNan { + __device__ __inline__ bool operator()(Float8E4M3FNUZ a) const { + return *reinterpret_cast(&a) == 0x80; + } +}; + +template<> +struct _IsNan { + __device__ __inline__ bool operator()(Float8E5M2 a) const { + uint8_t c = *reinterpret_cast(&a); + return ((c & 0x7c) == 0x7c) && ((c & 0x03) != 0x00); + } +}; + +template<> +struct _IsNan { + __device__ __inline__ bool operator()(Float8E5M2FNUZ a) const { + return *reinterpret_cast(&a) == 0x80; + } +}; + +#endif + // We would like to use 64-bit integer to support large matrices. However, CUDA seems to support only 32-bit integer // For now, use int32_t to ensure that both Linux and Windows see this as 32 bit integer type. #ifndef CUDA_LONG diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index bade2faf8f2e2..18c7334af6611 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -746,6 +746,7 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 12, uint32_t, Cast); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 12, uint64_t, Cast); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 12, bool, Cast); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 12, IsNaN); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 2, 10, float, Pad); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 2, 10, double, Pad); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 2, 10, MLFloat16, Pad); @@ -938,7 +939,6 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDom // OpSet 12 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, Clip); - class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, float, MaxPool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, double, MaxPool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, MLFloat16, MaxPool); @@ -1087,6 +1087,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, U class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, Concat); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, Gather); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, 
kOnnxDomain, 13, GatherElements); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 19, IsNaN); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, MatMul); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, MatMul); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, MatMul); @@ -1368,6 +1369,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, double, Gelu); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, MLFloat16, Gelu); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsInf); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsNaN); template <> KernelCreateInfo BuildKernelCreateInfo() { @@ -1553,6 +1555,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -1979,6 +1982,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2279,6 +2283,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif }; diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc index 00de1b37f3302..24593b255371c 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc @@ -109,6 +109,50 @@ Status IsInf::ComputeInternal(OpKernelContext* context) const { return Status::OK(); } +// IsNan +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + IsNaN, + kOnnxDomain, + 9, + 12, + kCudaExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T1", BuildKernelDefConstraints()) + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), + IsNaN); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + IsNaN, + kOnnxDomain, + 13, + 19, + kCudaExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T1", BuildKernelDefConstraints()) + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), + IsNaN); + +ONNX_OPERATOR_KERNEL_EX( + IsNaN, + kOnnxDomain, + 20, + kCudaExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T1", BuildKernelDefConstraints()) + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), + IsNaN); + +Status IsNaN::ComputeInternal(OpKernelContext* context) const { + UnaryElementwisePreparation p; + ORT_RETURN_IF_ERROR(UnaryElementwise::Prepare(context, &p)); + + Explicit_Impl_IsNan(Stream(context), p.input_tensor->GetElementType(), p.input_tensor->DataRaw(), + p.output_tensor->MutableData(), + p.input_tensor->Shape().Size()); + + return Status::OK(); +} + #define UNARY_OP_VERSIONED_TYPED(name, startver, endver, T) \ UNARY_ELEMENTWISE_REGISTER_VERSIONED_KERNEL(name, startver, endver, T) diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.h b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.h index 3b7d6df7221b7..95d68b5e1d534 100644 --- 
a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.h +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.h @@ -131,5 +131,11 @@ class IsInf final : public UnaryElementwise { int opset_; }; +class IsNaN : public UnaryElementwise { + public: + explicit IsNaN(const OpKernelInfo& info) : UnaryElementwise(info) {} + Status ComputeInternal(OpKernelContext* context) const override; +}; + } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu index 554d5908cf854..2cdfcda5be26a 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu @@ -315,13 +315,33 @@ void Explicit_Impl_IsInf(cudaStream_t stream, int op_set, if (op_set < 20) { utils::MLTypeCallDispatcher dispatcher{input_data_type}; dispatcher.Invoke(stream, input_raw, output_data, - detect_positive, detect_negative, count); + detect_positive, detect_negative, count); } else { utils::MLTypeCallDispatcher dispatcher{input_data_type}; dispatcher.Invoke(stream, input_raw, output_data, - detect_positive, detect_negative, count); + detect_positive, detect_negative, count); } } +// IsNan + +namespace isnan_details { +template +struct IsNan_Disp { + void operator()(cudaStream_t stream, const void* input_raw, bool* output_data, size_t count) const { + using CudaType = typename ToCudaType::MappedType; + const auto* input_data = reinterpret_cast(input_raw); + UnaryElementWiseImpl(stream, input_data, output_data, _IsNan{}, count); + } +}; +} // namespace isnan_details + +void Explicit_Impl_IsNan(cudaStream_t stream, int32_t input_data_type, + const void* input_raw, bool* output_data, size_t count) { + // KernelDef constraints would ensure only subset of datatypes is used. 
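+  // Note: the dispatcher below selects IsNan_Disp<T> for the tensor's runtime element
+  // type; IsNan_Disp reinterprets the raw input pointer as the matching CUDA type and
+  // launches the elementwise kernel with the _IsNan<T> functor from cu_inc/common.cuh.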
+ utils::MLTypeCallDispatcher dispatcher{input_data_type}; + dispatcher.Invoke(stream, input_raw, output_data, count); +} + } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h index a606d479bc79b..2588f56e32c12 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h @@ -151,6 +151,20 @@ void Explicit_Impl_IsInf(cudaStream_t stream, int op_set, int32_t input_data_type, const void* input_raw, bool* output_data, size_t count); + +// IsNan +#define ISNAN_OPSET9_FLOATS float, double, MLFloat16 +#define ISNAN_OPSET13_FLOATS float, double, MLFloat16, BFloat16 +#if !defined(DISABLE_FLOAT8_TYPES) +#define ISNAN_OPSET20_FLOATS float, double, MLFloat16, BFloat16, Float8E4M3FN, Float8E4M3FNUZ, Float8E5M2, \ + Float8E5M2FNUZ +#else +#define ISNAN_OPSET20_FLOATS ISNAN_OPSET13_FLOATS +#endif + +void Explicit_Impl_IsNan(cudaStream_t stream, int32_t input_data_type, + const void* input_raw, bool* output_data, size_t count); + } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/rocm/cu_inc/common.cuh b/onnxruntime/core/providers/rocm/cu_inc/common.cuh index f3685606c17f5..1698e5ca8478c 100644 --- a/onnxruntime/core/providers/rocm/cu_inc/common.cuh +++ b/onnxruntime/core/providers/rocm/cu_inc/common.cuh @@ -429,6 +429,63 @@ struct _IsInf { } }; +// float and double +template +struct _IsNan { + __device__ __inline__ bool operator()(T a) const { + return isnan(a); + } +}; + +template <> +struct _IsNan { + __device__ __inline__ bool operator()(half a) const { + return static_cast(*reinterpret_cast(&a) & ~MLFloat16::kSignMask) + > MLFloat16::kPositiveInfinityBits; + } +}; + +template <> +struct _IsNan { + __device__ __inline__ bool operator()(BFloat16 a) const { + return static_cast(*reinterpret_cast(&a) & ~BFloat16::kSignMask) + > BFloat16::kPositiveInfinityBits; + } +}; + +#if !defined(DISABLE_FLOAT8_TYPES) + +template <> +struct _IsNan { + __device__ __inline__ bool operator()(Float8E4M3FN a) const { + return (*reinterpret_cast(&a) & 0x7f) == 0x7f; + } +}; + +template <> +struct _IsNan { + __device__ __inline__ bool operator()(Float8E4M3FNUZ a) const { + return *reinterpret_cast(&a) == 0x80; + } +}; + +template <> +struct _IsNan { + __device__ __inline__ bool operator()(Float8E5M2 a) const { + uint8_t c = *reinterpret_cast(&a); + return ((c & 0x7c) == 0x7c) && ((c & 0x03) != 0x00); + } +}; + +template <> +struct _IsNan { + __device__ __inline__ bool operator()(Float8E5M2FNUZ a) const { + return *reinterpret_cast(&a) == 0x80; + } +}; + +#endif + // We would like to use 64-bit integer to support large matrices. However, ROCM seems to support only 32-bit integer // For now, use int32_t to ensure that both Linux and Windows see this as 32 bit integer type. 
#ifndef HIP_LONG diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc index 32be74550951e..87daaeea969ac 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc @@ -734,6 +734,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 9, float, Shrink); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 9, double, Shrink); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 9, MLFloat16, Shrink); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 9, 12, IsNaN); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 7, 8, float, Less); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 7, 8, double, Less); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 7, 8, MLFloat16, Less); @@ -1067,6 +1068,7 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kO class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 18, uint32_t, Cast); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 18, uint64_t, Cast); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 18, bool, Cast); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 19, IsNaN); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 13, Reshape); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 14, Shape); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, Size); @@ -1346,6 +1348,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, S // Opset 20 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 20, IsInf); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 20, IsNaN); template <> KernelCreateInfo BuildKernelCreateInfo() { @@ -1531,6 +1534,7 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, // BuildKernelCreateInfo, @@ -1941,6 +1945,7 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2304,6 +2309,7 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { // opset 20 BuildKernelCreateInfo, + BuildKernelCreateInfo, }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/test/providers/cpu/tensor/isnan_test.cc b/onnxruntime/test/providers/cpu/tensor/isnan_test.cc index 0f1e5c07cdd9b..3cf99fde2cce7 100644 --- a/onnxruntime/test/providers/cpu/tensor/isnan_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/isnan_test.cc @@ -38,9 +38,23 @@ TEST(IsNaNOpTest, IsNaNFloat16_9) { run_is_nan_test(9, dims, input, output); } +TEST(IsNaNOpTest, IsNaNFloat16_13) { + std::vector dims{2, 2}; + std::initializer_list input = 
{MLFloat16::One, MLFloat16::NaN, MLFloat16(2.0f), MLFloat16::NaN}; + std::initializer_list output = {false, true, false, true}; + run_is_nan_test(13, dims, input, output); +} + TEST(IsNaNOpTest, IsNaNFloat16_20) { std::vector dims{2, 2}; - std::initializer_list input = {MLFloat16(1.0f), MLFloat16::NaN, MLFloat16(2.0f), MLFloat16::NaN}; + std::initializer_list input = {MLFloat16::One, MLFloat16::NaN, MLFloat16(2.0f), MLFloat16::NaN}; + std::initializer_list output = {false, true, false, true}; + run_is_nan_test(20, dims, input, output); +} + +TEST(IsNaNOpTest, IsNaNBFloat16_20) { + std::vector dims{2, 2}; + std::initializer_list input = {BFloat16::One, BFloat16::NaN, BFloat16(2.0f), BFloat16::NaN}; std::initializer_list output = {false, true, false, true}; run_is_nan_test(20, dims, input, output); } From 6c3bed674008694847374a59c9057a640cdd40e2 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Fri, 8 Mar 2024 12:50:13 +1000 Subject: [PATCH 129/279] Run CoreML EP with NeuralNetwork and ML Program in CI unit tests (#19796) ### Description Add synthetic CoreML EP name to the list of providers so we test with NeuralNetwork and MLProgram model types. ### Motivation and Context Automatically test new MLProgram support in CI --- onnxruntime/test/providers/base_tester.cc | 11 +++++++++++ .../test/providers/coreml/coreml_basic_test.cc | 6 +++++- onnxruntime/test/util/default_providers.cc | 6 +++--- onnxruntime/test/util/include/default_providers.h | 2 +- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc index 84cb663a2984a..e94f8c2673be3 100644 --- a/onnxruntime/test/providers/base_tester.cc +++ b/onnxruntime/test/providers/base_tester.cc @@ -613,6 +613,9 @@ void BaseTester::RunWithConfig(size_t* number_of_pre_packed_weights_counter, number_of_pre_packed_weights_counter, number_of_shared_pre_packed_weights_counter); } else { + // synthetic EP name for testing CoreML EP with ML Program + constexpr const char* kCoreMLExecutionProviderMLProgram = "CoreMLExecutionProvider_MLProgram"; + #ifdef USE_TENSORRT // only run trt ep to reduce test time static const std::string all_provider_types[] = { @@ -634,10 +637,16 @@ void BaseTester::RunWithConfig(size_t* number_of_pre_packed_weights_counter, kNnapiExecutionProvider, kRocmExecutionProvider, kCoreMLExecutionProvider, + kCoreMLExecutionProviderMLProgram, kQnnExecutionProvider, kSnpeExecutionProvider, kXnnpackExecutionProvider, }; + + // need to special case any synthetic EP names in the exclude list + if (ctx_.excluded_provider_types.count(kCoreMLExecutionProvider) > 0) { + ctx_.excluded_provider_types.insert(kCoreMLExecutionProviderMLProgram); + } #endif bool has_run = false; @@ -675,6 +684,8 @@ void BaseTester::RunWithConfig(size_t* number_of_pre_packed_weights_counter, execution_provider = DefaultRocmExecutionProvider(); else if (provider_type == onnxruntime::kCoreMLExecutionProvider) execution_provider = DefaultCoreMLExecutionProvider(); + else if (provider_type == kCoreMLExecutionProviderMLProgram) + execution_provider = DefaultCoreMLExecutionProvider(/*use_mlprogram*/ true); else if (provider_type == onnxruntime::kSnpeExecutionProvider) execution_provider = DefaultSnpeExecutionProvider(); else if (provider_type == onnxruntime::kQnnExecutionProvider) diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc index 94817158017bd..0f068ba48d3d8 100644 --- 
a/onnxruntime/test/providers/coreml/coreml_basic_test.cc +++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc @@ -192,8 +192,10 @@ TEST(CoreMLExecutionProviderTest, TestOrtFormatModel) { #endif } -// Test that we fix invalid names in model inputs, initializers and outputs. +#if defined(COREML_ENABLE_MLPROGRAM) // Names in CoreML cannot start with [0-9] or contain anything but "[a-z][A-Z][0-9]_" +// Test that we fix invalid names in model inputs, initializers and outputs. +// This is only enforced for ML Program, so we only do name sanitization when creating an ML Program format model. TEST(CoreMLExecutionProviderTest, TestNameSanitization) { OpTester test("Clip", 11); @@ -212,5 +214,7 @@ TEST(CoreMLExecutionProviderTest, TestNameSanitization) { // TensorRT does not support Clip opset 11 yet. test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } +#endif + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index b404c12db3582..c12a52c4356aa 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -223,21 +223,21 @@ std::unique_ptr DefaultRocmExecutionProvider(bool test_tunab return nullptr; } -std::unique_ptr DefaultCoreMLExecutionProvider() { +std::unique_ptr DefaultCoreMLExecutionProvider(bool use_mlprogram) { // To manually test CoreML model generation on a non-macOS platform, comment out the `&& defined(__APPLE__)` below. // The test will create a model but execution of it will obviously fail. - // To test creating an ML Program, set the environment variable COREML_EP_TEST_MLPROGRAM to any value. #if defined(USE_COREML) && defined(__APPLE__) // We want to run UT on CPU only to get output value without losing precision uint32_t coreml_flags = 0; coreml_flags |= COREML_FLAG_USE_CPU_ONLY; - if (!Env::Default().GetEnvironmentVar("COREML_EP_TEST_MLPROGRAM").empty()) { + if (use_mlprogram) { coreml_flags |= COREML_FLAG_CREATE_MLPROGRAM; } return CoreMLProviderFactoryCreator::Create(coreml_flags)->CreateProvider(); #else + ORT_UNUSED_PARAMETER(use_mlprogram); return nullptr; #endif } diff --git a/onnxruntime/test/util/include/default_providers.h b/onnxruntime/test/util/include/default_providers.h index 738fc66d775c6..ae8e89c386994 100644 --- a/onnxruntime/test/util/include/default_providers.h +++ b/onnxruntime/test/util/include/default_providers.h @@ -54,7 +54,7 @@ std::unique_ptr DefaultRknpuExecutionProvider(); std::unique_ptr DefaultAclExecutionProvider(bool enable_arena = true); std::unique_ptr DefaultArmNNExecutionProvider(bool enable_arena = true); std::unique_ptr DefaultRocmExecutionProvider(bool test_tunable_op = false); -std::unique_ptr DefaultCoreMLExecutionProvider(); +std::unique_ptr DefaultCoreMLExecutionProvider(bool use_mlprogram = false); std::unique_ptr DefaultSnpeExecutionProvider(); std::unique_ptr DefaultQnnExecutionProvider(); std::unique_ptr QnnExecutionProviderWithOptions(const ProviderOptions& options, From 24b72d26134a5b8d841588efc8dff7579241b0ce Mon Sep 17 00:00:00 2001 From: Satya Kumar Jandhyala Date: Thu, 7 Mar 2024 19:07:49 -0800 Subject: [PATCH 130/279] [JS/WebGPU] Preserve zero size input tensor dims. (#19737) ### Description For Concat operation, the zero-size input tensor shape need to be preserved and, unlike non-zero tensors, the dims are not constrained to match other input tensors' dims. 
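A minimal Python sketch of the shape rule the rewritten kernel follows (illustrative only; `concat_output_shape` is a made-up helper, not code from this change):
```
def concat_output_shape(shapes, axis):
    # Mirror of the new concat.ts logic: start from the first input's dims, then
    # replace the (normalized) concat axis with the sum over all inputs.
    rank = len(shapes[0])
    adjusted_axis = axis if axis >= 0 else axis + rank
    out = list(shapes[0])
    out[adjusted_axis] = sum(s[adjusted_axis] for s in shapes if len(s) > adjusted_axis)
    return out

assert concat_output_shape([(0, 1), (1, 1)], axis=0) == [1, 1]  # zero-sized dims preserved
assert concat_output_shape([(0, 0), (0, 1), (0, 2), (0, 3)], axis=1) == [0, 6]
```
Zero-sized inputs contribute their extent on the concat axis to the output shape but are still filtered out before the GPU program is dispatched.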
### Motivation and Context --- js/web/lib/wasm/jsep/webgpu/ops/concat.ts | 146 +++++++++---------- js/web/test/data/ops/concat_zero-sized.jsonc | 80 ++++++++++ 2 files changed, 149 insertions(+), 77 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts index b142a82e551a7..010ee589c44fa 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts @@ -13,25 +13,32 @@ export interface ConcatAttributes extends AttributeWithCacheKey { readonly axis: number; } -const validateInputs = (inputs: readonly TensorView[]): void => { +const validateInputs = (inputs: readonly TensorView[], axis: number): void => { if (!inputs || inputs.length < 1) { throw new Error('too few inputs'); } - - const inputType = inputs[0].dataType; - const inputDimensionality = inputs[0].dims.length; - - for (const input of inputs) { + const referenceIndex = 0; + const referenceInput = inputs[referenceIndex]; + const inputType = referenceInput.dataType; + const inputRank = referenceInput.dims.length; + inputs.forEach((input, i) => { + if (i === referenceIndex) { + return; + } // make sure types of all inputs match if (input.dataType !== inputType) { throw new Error('input tensors should be one type'); } - // make sure the dimensionality of all inputs are the same - if (input.dims.length !== inputDimensionality) { + if (input.dims.length !== inputRank) { throw new Error('input tensors should have the same shape'); } - } + input.dims.forEach((dim, i) => { + if (i !== axis && dim !== referenceInput.dims[i]) { + throw new Error('non concat dimensions must match'); + } + }); + }); }; const calculateInputIndexImpl = (numberOfTensors: number, sizeInConcatAxisStr: string): string => ` @@ -64,65 +71,43 @@ const assignOutputData = (inputs: readonly IndicesHelper[], output: IndicesHelpe return codeLines.join('\n'); }; -const createConcatProgramInfo = (inputs: readonly TensorView[], axis: number): ProgramInfo => { - const inputShape = inputs[0].dims.slice(); - if (axis >= inputShape.length || axis < (-1 * inputShape.length)) { - throw new Error('axis specified for concat doesn\'t match input dimensionality'); - } - const adjustedAxis = (axis < 0) ? 
inputShape.length + axis : axis; - // ensure all of the non-concatenated axes match each other - // calculate the shape of the output tensor while we do that - const outputShape = inputShape.slice(0); - for (let i = 1; i < inputs.length; i++) { - const dataNShape = inputs[i].dims.slice(); - for (let axisIndex = 0; axisIndex < inputShape.length; axisIndex++) { - // add to the placeholder for computing output shape - if (axisIndex === adjustedAxis) { - outputShape[adjustedAxis] += dataNShape[axisIndex]; +const createConcatProgramInfo = + (inputs: readonly TensorView[], adjustedAxis: number, outputShape: number[], dataType: DataType): ProgramInfo => { + const outputSize = ShapeUtil.size(outputShape); + + const sizeInConcatAxis = new Array(inputs.length); + const inputVars = new Array(inputs.length); + + let previousSum = 0; + const inputDependencies: ProgramInputTensorInfoDependency[] = []; + const inputRanks = []; + const programUniforms: ProgramUniform[] = [{type: DataType.uint32, data: outputSize}]; + for (let i = 0; i < inputs.length; ++i) { + previousSum += inputs[i].dims[adjustedAxis]; + sizeInConcatAxis[i] = previousSum; + inputRanks.push(inputs[i].dims.length); + inputVars[i] = inputVariable(`input${i}`, dataType, inputRanks[i]); + inputDependencies.push('rank'); + programUniforms.push({type: DataType.uint32, data: sizeInConcatAxis[i]}); } - // ensure all non-cancatenated axes match each other - else if (inputShape[axisIndex] !== dataNShape[axisIndex]) { - throw new Error('non concat dimensions must match'); + for (let i = 0; i < inputs.length; ++i) { + programUniforms.push(...createTensorShapeVariables(inputs[i].dims)); } - } - } - - const outputSize = ShapeUtil.size(outputShape); - - const sizeInConcatAxis = new Array(inputs.length); - const inputVars = new Array(inputs.length); - const dataType = inputs[0].dataType; - - let previousSum = 0; - const inputDependencies: ProgramInputTensorInfoDependency[] = []; - const inputRanks = []; - const programUniforms: ProgramUniform[] = [{type: DataType.uint32, data: outputSize}]; - for (let i = 0; i < inputs.length; ++i) { - previousSum += inputs[i].dims[adjustedAxis]; - sizeInConcatAxis[i] = previousSum; - inputRanks.push(inputs[i].dims.length); - inputVars[i] = inputVariable(`input${i}`, dataType, inputRanks[i]); - inputDependencies.push('rank'); - programUniforms.push({type: DataType.uint32, data: sizeInConcatAxis[i]}); - } - for (let i = 0; i < inputs.length; ++i) { - programUniforms.push(...createTensorShapeVariables(inputs[i].dims)); - } - programUniforms.push(...createTensorShapeVariables(outputShape)); + programUniforms.push(...createTensorShapeVariables(outputShape)); - const output = outputVariable('output', dataType, outputShape.length); - const indicesAxis = output.indicesGet('indices', adjustedAxis); - const sizeInConcatAxisStr = - Array.from(Array(sizeInConcatAxis.length).keys()).map(i => `uniforms.sizeInConcatAxis${i}`).join(','); - const getShaderSource = (shaderHelper: ShaderHelper) => ` + const output = outputVariable('output', dataType, outputShape.length); + const indicesAxis = output.indicesGet('indices', adjustedAxis); + const sizeInConcatAxisStr = + Array.from(Array(sizeInConcatAxis.length).keys()).map(i => `uniforms.sizeInConcatAxis${i}`).join(','); + const getShaderSource = (shaderHelper: ShaderHelper) => ` ${(() => { - shaderHelper.registerUniform('outputSize', 'u32'); - for (let i = 0; i < inputs.length; i++) { - shaderHelper.registerUniform(`sizeInConcatAxis${i}`, 'u32'); - } - return 
shaderHelper.declareVariables(...inputVars, output); - })()} + shaderHelper.registerUniform('outputSize', 'u32'); + for (let i = 0; i < inputs.length; i++) { + shaderHelper.registerUniform(`sizeInConcatAxis${i}`, 'u32'); + } + return shaderHelper.declareVariables(...inputVars, output); + })()} ${calculateInputIndexImpl(sizeInConcatAxis.length, sizeInConcatAxisStr)} @@ -140,23 +125,30 @@ const createConcatProgramInfo = (inputs: readonly TensorView[], axis: number): P ${assignOutputData(inputVars, output)} }`; - return { - name: 'Concat', - shaderCache: {hint: `${axis}`, inputDependencies}, - getRunData: () => ({ - outputs: [{dims: outputShape, dataType: inputs[0].dataType}], - dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, - programUniforms, - }), - getShaderSource, - }; -}; + return { + name: 'Concat', + shaderCache: {hint: `${adjustedAxis}`, inputDependencies}, + getRunData: () => ({ + outputs: [{dims: outputShape, dataType}], + dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms, + }), + getShaderSource, + }; + }; export const concat = (context: ComputeContext, attributes: ConcatAttributes): void => { - validateInputs(context.inputs); + const inputs = context.inputs; + const inputShape = inputs[0].dims; + const adjustedAxis = ShapeUtil.normalizeAxis(attributes.axis, inputShape.length); + validateInputs(inputs, adjustedAxis); + const outputShape = inputShape.slice(); + outputShape[adjustedAxis] = + inputs.reduce((sum, input) => sum + (input.dims.length > adjustedAxis ? input.dims[adjustedAxis] : 0), 0); // 0 length tensors are valid for concat, remove them - const nonEmptyInputs = context.inputs.filter(input => ShapeUtil.size(input.dims) > 0); - context.compute(createConcatProgramInfo(nonEmptyInputs, attributes.axis), {inputs: nonEmptyInputs}); + const nonEmptyInputs = inputs.filter(input => ShapeUtil.size(input.dims) > 0); + context.compute( + createConcatProgramInfo(nonEmptyInputs, adjustedAxis, outputShape, inputs[0].dataType), {inputs: nonEmptyInputs}); }; export const parseConcatAttributes = (attributes: Record): ConcatAttributes => diff --git a/js/web/test/data/ops/concat_zero-sized.jsonc b/js/web/test/data/ops/concat_zero-sized.jsonc index 7be8e8c1cc602..be9625145d157 100644 --- a/js/web/test/data/ops/concat_zero-sized.jsonc +++ b/js/web/test/data/ops/concat_zero-sized.jsonc @@ -557,5 +557,85 @@ ] } ] + }, + { + "name": "Concat 2D axis=1; Preserve dims", + "operator": "Concat", + "attributes": [ + { + "name": "axis", + "data": 0, + "type": "int" + } + ], + "cases": [ + { + "name": "Some but not all input tensors are zero-sized", + "inputs": [ + { + "data": [], + "dims": [0, 1], + "type": "float32" + }, + { + "data": [1], + "dims": [1, 1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [1], + "dims": [1, 1], + "type": "float32" + } + ] + } + ] + }, + { + "name": "Concat 2D axis=1; Preserve dims", + "operator": "Concat", + "attributes": [ + { + "name": "axis", + "data": 1, + "type": "int" + } + ], + "cases": [ + { + "name": "All input tensors are zero-sized", + "inputs": [ + { + "data": [], + "dims": [0, 0], + "type": "float32" + }, + { + "data": [], + "dims": [0, 1], + "type": "float32" + }, + { + "data": [], + "dims": [0, 2], + "type": "float32" + }, + { + "data": [], + "dims": [0, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [], + "dims": [0, 6], + "type": "float32" + } + ] + } + ] } ] From 01c376a0b9ebd251d5712fa14a448335a2bde780 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Fri, 8 Mar 2024 
17:52:47 +1000 Subject: [PATCH 131/279] Update script to run CIs for a branch. (#19797) ### Description - Support multiple include/exclude values. - e.g. can now run with `-i MacOS -i iOS` to run CIs for both Apple platforms. - Default to current branch if run from directory in repo. - make lazier usage possible ### Motivation and Context Improve tools. --------- Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> --- tools/python/run_CIs_for_branch.py | 55 +++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/tools/python/run_CIs_for_branch.py b/tools/python/run_CIs_for_branch.py index c507cae0d9f43..975ea2b988d75 100644 --- a/tools/python/run_CIs_for_branch.py +++ b/tools/python/run_CIs_for_branch.py @@ -13,13 +13,20 @@ from util.platform_helpers import is_windows +class DefaultArgsRawHelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): + pass + + def _parse_args(): parser = argparse.ArgumentParser( os.path.basename(__file__), - formatter_class=argparse.RawDescriptionHelpFormatter, + formatter_class=DefaultArgsRawHelpFormatter, description="""Run the CIs used to validate PRs for the specified branch. + If not specified, the branch will be inferred (if possible) by running `git branch --show-current`. + If specified, the `--include` filter is applied first, followed by any `--exclude` filter. + `--include` and `--exclude` can be specified multiple times to accumulate values to include/exclude. Requires the Azure CLI with DevOps extension to be installed. Azure CLI: https://learn.microsoft.com/en-us/cli/azure/install-azure-cli @@ -44,12 +51,30 @@ def _parse_args(): """, ) - parser.add_argument("-i", "--include", type=str, help="Include CIs that match this string. Case insensitive.") - parser.add_argument("-e", "--exclude", type=str, help="Exclude CIs that match this string. Case insensitive.") + current_branch = None + get_branch_result = subprocess.run(["git", "branch", "--show-current"], capture_output=True, text=True, check=False) + if get_branch_result.returncode == 0: + current_branch = get_branch_result.stdout.strip() + + parser.add_argument( + "-i", "--include", action="append", type=str, help="Include CIs that match this string. Case insensitive." + ) + parser.add_argument( + "-e", "--exclude", action="append", type=str, help="Exclude CIs that match this string. Case insensitive." + ) parser.add_argument("--dry-run", action="store_true", help="Print selected CIs but do not run them.") - parser.add_argument("branch", type=str, help="Specify the branch to run.") + parser.add_argument( + "branch", + type=str, + nargs="?", + default=current_branch, + help="Specify the branch to run. 
Default is current branch if available.", + ) args = parser.parse_args() + if not args.branch: + raise ValueError("Branch was unable to be inferred and must be specified") + return args @@ -77,25 +102,37 @@ def main(): pipelines = get_pipeline_names() pipelines_to_run = [] if args.include: - value = args.include.lower().strip() + values = [i.lower().strip() for i in args.include] for p in pipelines: - if value in p.lower(): + include = False + for value in values: + if value in p.lower(): + include = True + break + + if include: print(f"Including {p}") pipelines_to_run.append(p) else: pipelines_to_run = pipelines if args.exclude: - value = args.exclude.lower().strip() + values = [e.lower().strip() for e in args.exclude] cur_pipelines = pipelines_to_run pipelines_to_run = [] for p in cur_pipelines: - if value in p.lower(): + exclude = False + for value in values: + if value in p.lower(): + exclude = True + break + + if exclude: print(f"Excluding {p}") else: pipelines_to_run.append(p) - print("Pipelines to run:") + print(f"Pipelines to run for {args.branch}:") for p in pipelines_to_run: print(f"\t{p}") From 3170a48e60979ce1fb0d391cab7b0572bab90fff Mon Sep 17 00:00:00 2001 From: Yifan Li <109183385+yf711@users.noreply.github.com> Date: Fri, 8 Mar 2024 10:24:36 -0800 Subject: [PATCH 132/279] [EP Perf] Add tag to indicate which TRT parser is using (#19784) ### Description * Add tag to distinguish if TRT `builtin` or `oss` parser is being used * `oss` tag will be inserted with onnx-tensorrt commit id, to indicate which version oss parser is ### Validate DB entry before/after this PR (during test, `builtin` or `oss_{commit_id}` tag was inserted in the database entries): ### Motivation and Context To distinguish perf results using builtin/oss parser in the database, this parser tag is needed. In future, results using different parsers will be listed in different Perf Dashboard pages. --- .../python/tools/tensorrt/perf/post.py | 25 ++++++++++++++++--- ...linux-gpu-tensorrt-daily-perf-pipeline.yml | 6 ++++- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/onnxruntime/python/tools/tensorrt/perf/post.py b/onnxruntime/python/tools/tensorrt/perf/post.py index 0f5614bd5160f..363fa3a96d283 100644 --- a/onnxruntime/python/tools/tensorrt/perf/post.py +++ b/onnxruntime/python/tools/tensorrt/perf/post.py @@ -56,6 +56,7 @@ def parse_arguments(): parser.add_argument("-b", "--branch", help="Branch", required=True) parser.add_argument("--kusto_conn", help="Kusto connection URL", required=True) parser.add_argument("--database", help="Database name", required=True) + parser.add_argument("--use_tensorrt_oss_parser", help="Use TensorRT OSS parser", required=False) parser.add_argument( "-d", "--commit_datetime", @@ -370,7 +371,7 @@ def write_table( ingest_client.ingest_from_dataframe(table, ingestion_properties=ingestion_props) -def get_identifier(commit_datetime, commit_hash, trt_version, branch): +def get_identifier(commit_datetime, commit_hash, trt_version, branch, use_tensorrt_oss_parser): """ Returns an identifier that associates uploaded data with an ORT commit/date/branch and a TensorRT version. 
@@ -383,7 +384,23 @@ def get_identifier(commit_datetime, commit_hash, trt_version, branch): """ date = str(commit_datetime.date()) # extract date only - return date + "_" + commit_hash + "_" + trt_version + "_" + branch + if use_tensorrt_oss_parser: + current_dir = os.path.dirname(os.path.abspath(__file__)) + root_dir = os.path.abspath(os.path.join(current_dir, "../../../../..")) + deps_txt_path = os.path.join(root_dir, "cmake", "deps.txt") + commit_head = "" + with open(deps_txt_path) as file: + for line in file: + parts = line.split(";") + if parts[0] == "onnx_tensorrt": + url = parts[1] + commit = url.split("/")[-1] + commit_head = commit[:6] + break + parser = f"oss_{commit_head}" + else: + parser = "builtin" + return "_".join([date, commit_hash, trt_version, parser, branch]) def main(): @@ -396,7 +413,9 @@ def main(): # connect to database kcsb_ingest = KustoConnectionStringBuilder.with_az_cli_authentication(args.kusto_conn) ingest_client = QueuedIngestClient(kcsb_ingest) - identifier = get_identifier(args.commit_datetime, args.commit_hash, args.trt_version, args.branch) + identifier = get_identifier( + args.commit_datetime, args.commit_hash, args.trt_version, args.branch, args.use_tensorrt_oss_parser + ) upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0) try: diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml index eaadc6ad728c0..9f3a127262bb1 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml @@ -76,6 +76,10 @@ jobs: - name: image value: ort-image-$(Build.BuildId) + + - name: parser + ${{ if eq(parameters.UseTensorrtOssParser, true) }}: + value: --use_tensorrt_oss_parser $(parameters.UseTensorrtOssParser) }} steps: - ${{ if and(eq(parameters.TrtVersion, 'BIN'), eq(parameters.UseTensorrtOssParser, false)) }}: @@ -155,7 +159,7 @@ jobs: inlineScript: | short_hash=$(git rev-parse --short HEAD) && commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) && - python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact/result -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) + python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact/result -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser) - template: templates/component-governance-component-detection-steps.yml parameters : From 069d2d6f54f5cfa49e2ddfea4542150b88f47a55 Mon Sep 17 00:00:00 2001 From: Yifan Li <109183385+yf711@users.noreply.github.com> Date: Fri, 8 Mar 2024 13:58:22 -0800 Subject: [PATCH 133/279] [EP Perf] Update EP Perf dockerfiles with cuda12/cudnn9 (#19781) ### Description * Update name of existing dockerfiles and add support to test latest TensorRT EA binary located in the image * Add cuda 12.3/cuDNN 9/TensorRT 8.6 dockerfile * Add detail to CI prompts and configs Instruction to test latest TRT via BIN: 1. Select `BIN` in TensorRT Version 2. In Variables, update related tarCudaVersion, **clear** tarCudnnVersion (not required in latest TRT tar binary) , and path to binary. 
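A small Python sketch of the tarball naming the BIN flow now assumes (`expected_trt_tarball` is a hypothetical helper, not part of this PR); the cuDNN component is dropped from the file name, which is why tarCudnnVersion can be cleared:
```
def expected_trt_tarball(tar_trt_version, tar_cuda_version):
    # File name the updated pipeline copies into the Docker build directory; note
    # there is no longer a ".cudnnX" component in the name.
    return f"TensorRT-{tar_trt_version}.Linux.x86_64-gnu.cuda-{tar_cuda_version}.tar.gz"

assert (expected_trt_tarball("X.Y.Z.W", "12.4")
        == "TensorRT-X.Y.Z.W.Linux.x86_64-gnu.cuda-12.4.tar.gz")
```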
--- .../tools/tensorrt/perf/build/build_image.py | 37 +++---- ...linux-gpu-tensorrt-daily-perf-pipeline.yml | 17 ++-- .../Dockerfile.ubuntu_cuda12_3_tensorrt8_6 | 96 +++++++++++++++++++ .../docker/Dockerfile.ubuntu_tensorrt_bin | 93 +++++++++++++----- 4 files changed, 183 insertions(+), 60 deletions(-) create mode 100644 tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6 diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py index 2ae64a72d08fe..b95ad3c0a55ef 100644 --- a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py +++ b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py @@ -14,9 +14,10 @@ from typing import List, Optional TRT_DOCKER_FILES = { - "8.4": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4", - "8.5": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5", - "8.6": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6", + "8.4.cuda_11_6_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4", + "8.5.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5", + "8.6.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6", + "8.6.cuda_12_3_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6", "BIN": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin", } @@ -99,18 +100,11 @@ def docker_build_trt(args: argparse.Namespace): :param args: The arguments to this script. """ - if not is_valid_ver_str(args.trt_version, min_comps=2, max_comps=4): - print(f"[ERROR]: Invalid TensorRT version '{args.trt_version}'", file=sys.stderr) - sys.exit(1) - - vers_comps = args.trt_version.split(".") - trt_ver_key = f"{vers_comps[0]}.{vers_comps[1]}" - - if trt_ver_key not in TRT_DOCKER_FILES: + if args.trt_version not in TRT_DOCKER_FILES: print(f"[ERROR]: TensorRT version '{args.trt_version}' is currently unsupported", file=sys.stderr) sys.exit(1) - docker_file = TRT_DOCKER_FILES[trt_ver_key] + docker_file = TRT_DOCKER_FILES[args.trt_version] docker_file_path = os.path.normpath(os.path.join(args.repo_path, docker_file)) if not os.path.isfile(docker_file_path): @@ -144,11 +138,7 @@ def docker_build_trt_bin(args: argparse.Namespace): sys.exit(1) if not is_valid_ver_str(args.tar_cuda_version, 2, 2): - print("[ERROR]: Must specify a valid CUDA version for binary TensorRT installs (e.g., 11.x)", file=sys.stderr) - sys.exit(1) - - if not is_valid_ver_str(args.tar_cudnn_version, 2, 2): - print("[ERROR]: Must specify a valid cuDNN version for binary TensorRT installs (e.g., 8.x)", file=sys.stderr) + print("[ERROR]: Must specify a valid CUDA version for binary TensorRT installs (e.g., 12.4)", file=sys.stderr) sys.exit(1) if not os.path.isfile(docker_file_path): @@ -170,8 +160,6 @@ def docker_build_trt_bin(args: argparse.Namespace): "--build-arg", f"TAR_CUDA_VERSION={args.tar_cuda_version}", "--build-arg", - f"TAR_CUDNN_VERSION={args.tar_cudnn_version}", - "--build-arg", f"TRT_BINS_DIR={args.trt_bins_dir}", "-f", f"{docker_file_path}", @@ -195,7 +183,9 @@ def parse_arguments() -> argparse.Namespace: parser.add_argument("-r", "--repo_path", required=True, help="Path to the onnxruntime repository") parser.add_argument("-i", "--image_name", required=True, help="The resulting Docker image name") parser.add_argument("-b", "--branch", default="main", help="Name of the 
onnxruntime git branch to checkout") - parser.add_argument("-t", "--trt_version", default="8.6.1.6", help="TensorRT version (e.g., 8.6.1.6)") + parser.add_argument( + "-t", "--trt_version", default="8.6.cuda_11_8_cudnn_8", help="TensorRT version (e.g., 8.6.cuda_11_8_cudnn_8)" + ) parser.add_argument("-a", "--cuda_arch", default="75", help="CUDA architecture (e.g., 75)") # Command-line options for installing TensorRT from binaries. @@ -208,12 +198,7 @@ def parse_arguments() -> argparse.Namespace: parser.add_argument( "--tar_cuda_version", default="", - help="CUDA version (e.g., 11.8) used to find TensorRT EA binary tar.gz package", - ) - parser.add_argument( - "--tar_cudnn_version", - default="", - help="CUDA version (e.g., 8.6) used to find TensorRT EA binary tar.gz package", + help="CUDA version (e.g., 12.4) used to find TensorRT EA binary tar.gz package", ) parser.add_argument("--trt_bins_dir", default="", help="Directory containing TensorRT tar.gz package") parser.add_argument( diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml index 9f3a127262bb1..15f558e6f9ef0 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml @@ -8,15 +8,16 @@ parameters: - name: TrtVersion displayName: TensorRT Version type: string - default: 8.6.1.6 + default: 8.6.cuda_11_8_cudnn_8 values: - - 8.4.1.5 - - 8.5.1.1 - - 8.6.1.6 + - 8.4.cuda_11_6_cudnn_8 + - 8.5.cuda_11_8_cudnn_8 + - 8.6.cuda_11_8_cudnn_8 + - 8.6.cuda_12_3_cudnn_9 - BIN - name: UseTensorrtOssParser - displayName: Use TensorRT-OSS Parser + displayName: Use TensorRT-OSS Parser (not compatible with BIN) type: boolean default: false @@ -86,11 +87,11 @@ jobs: - script: 'ls -al $(trtBinsDir)' displayName: 'Show available TensorRT .tar.gz packages' - - script: 'cp $(trtBinsDir)/TensorRT-$(trtVersion).Linux.x86_64-gnu.cuda-$(tarCudaVersion).cudnn$(tarCudnnVersion).tar.gz $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build/' + - script: 'cp $(trtBinsDir)/TensorRT-$(trtVersion).Linux.x86_64-gnu.cuda-$(tarCudaVersion).tar.gz $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build/' displayName: 'Copy TensorRT .tar.gz package into Docker build directory' - - script: 'python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build/build_image.py -r $(Build.SourcesDirectory) -i $(image) -b $(branchName) -t $(trtVersion) -a 75 --install_bin --tar_cuda_version=$(tarCudaVersion) --tar_cudnn_version=$(tarCudnnVersion) --trt_bins_dir=.' - displayName: 'Install TensorRT from binaries and build latest ORT Image' + - script: 'python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build/build_image.py -r $(Build.SourcesDirectory) -i $(image) -b $(branchName) -t $(trtVersion) -a 75 --install_bin --tar_cuda_version=$(tarCudaVersion) --trt_bins_dir=.' 
+ displayName: 'Install TensorRT $(tarTrtVersion) from binaries and build latest ORT Image' workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build' # Build ORT with TensorRT built-in parser diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6 new file mode 100644 index 0000000000000..9493480784e81 --- /dev/null +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6 @@ -0,0 +1,96 @@ +# -------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------- +# Dockerfile to run ONNXRuntime with TensorRT integration + +# Build base image with required system packages +FROM nvidia/cuda:12.3.1-devel-ubuntu20.04 AS base + +# The local directory into which to build and install CMAKE +ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code + +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update &&\ + apt-get install -y sudo git bash unattended-upgrades wget +RUN unattended-upgrade + +# Install python3 +RUN apt-get install -y --no-install-recommends \ + python3 \ + python3-pip \ + python3-dev \ + python3-wheel &&\ + cd /usr/local/bin &&\ + ln -s /usr/bin/python3 python &&\ + ln -s /usr/bin/pip3 pip; + +RUN pip install --upgrade pip +RUN pip install setuptools>=68.2.2 + +# Install cuDNN v9 +RUN apt-get -y install cudnn9-cuda-12 + +# Install TensorRT +RUN v="8.6.1.6-1+cuda12.0" &&\ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ + apt-get update &&\ + sudo apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} libnvinfer-lean8=${v} libnvinfer-vc-plugin8=${v} libnvinfer-dispatch8=${v}\ + libnvinfer-headers-dev=${v} libnvinfer-headers-plugin-dev=${v} libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} libnvinfer-lean-dev=${v} libnvinfer-vc-plugin-dev=${v} libnvinfer-dispatch-dev=${v}\ + python3-libnvinfer=${v} libnvinfer-samples=${v} tensorrt-dev=${v} tensorrt-libs=${v} + +# Compile trtexec +RUN cd /usr/src/tensorrt/samples/trtexec && make + +# Install Valgrind +RUN apt-get install -y valgrind + +# Build final image from base. Builds ORT. +FROM base as final +ARG BUILD_USER=onnxruntimedev +ARG BUILD_UID=1000 +RUN adduser --gecos 'onnxruntime Build User' --disabled-password $BUILD_USER --uid $BUILD_UID +USER $BUILD_USER + +# ONNX Runtime arguments + +# URL to the github repo from which to clone ORT. +ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime + +# The local directory into which to clone ORT. +ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code + +# The git branch of ORT to checkout and build. +ARG ONNXRUNTIME_BRANCH=main + +# Optional. The specific commit to pull and build from. If not set, the latest commit is used. 
+ARG ONNXRUNTIME_COMMIT_ID + +# The supported CUDA architecture +ARG CMAKE_CUDA_ARCHITECTURES=75 + +WORKDIR ${ONNXRUNTIME_LOCAL_CODE_DIR} + +# Clone ORT repository with branch +RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ + /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh + +WORKDIR ${ONNXRUNTIME_LOCAL_CODE_DIR}/onnxruntime + +# Reset to a specific commit if specified by build args. +RUN if [ -z "$ONNXRUNTIME_COMMIT_ID" ] ; then echo "Building branch ${ONNXRUNTIME_BRANCH}" ;\ + else echo "Building branch ${ONNXRUNTIME_BRANCH} @ commit ${ONNXRUNTIME_COMMIT_ID}" &&\ + git reset --hard ${ONNXRUNTIME_COMMIT_ID} && git submodule update --recursive ; fi + +# Build ORT +ENV CUDA_MODULE_LOADING "LAZY" +ARG PARSER_CONFIG="" +RUN /bin/sh build.sh ${PARSER_CONFIG} --parallel --build_shared_lib --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --use_tensorrt --tensorrt_home /usr/lib/x86_64-linux-gnu/ --config Release --build_wheel --skip_tests --skip_submodule_sync --cmake_extra_defines '"CMAKE_CUDA_ARCHITECTURES='${CMAKE_CUDA_ARCHITECTURES}'"' + +# Switch to root to continue following steps of CI +USER root + +# Intall ORT wheel +RUN pip install ${ONNXRUNTIME_LOCAL_CODE_DIR}/onnxruntime/build/Linux/Release/dist/*.whl \ No newline at end of file diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin index 21b09b2d8978e..a26bf88fbbdf6 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin @@ -4,29 +4,15 @@ # -------------------------------------------------------------- # Dockerfile to run ONNXRuntime with TensorRT installed from provided binaries -FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 +# Build base image with required system packages +FROM nvidia/cuda:12.3.1-devel-ubuntu20.04 AS base +# The local directory into which to build and install CMAKE +ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code -# ONNX Runtime Variables -ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime -ARG ONNXRUNTIME_BRANCH=main -ARG CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80 - -# Must provide version numbers used to build the name of the tar file containing TensorRT binaries. -# See: https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html#installing-tar -ARG TAR_TRT_VERSION -ARG TAR_CUDA_VERSION -ARG TAR_CUDNN_VERSION - -# Directory containing TensorRT tar.gz installation package -ARG TRT_BINS_DIR=. - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/code/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} - +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} ENV DEBIAN_FRONTEND=noninteractive -COPY ${TRT_BINS_DIR}/TensorRT-${TAR_TRT_VERSION}.Linux.x86_64-gnu.cuda-${TAR_CUDA_VERSION}.cudnn${TAR_CUDNN_VERSION}.tar.gz /TensorRT-${TAR_TRT_VERSION}.tar.gz - RUN apt-get update &&\ apt-get install -y sudo git bash unattended-upgrades wget RUN unattended-upgrade @@ -44,22 +30,77 @@ RUN apt-get install -y --no-install-recommends \ RUN pip install --upgrade pip RUN pip install setuptools>=68.2.2 +# Install cuDNN v9 +RUN apt-get -y install cudnn9-cuda-12 + +# Install TensorRT +# Must provide version numbers used to build the name of the tar file containing TensorRT binaries. 
+# See: https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html#installing-tar +ARG TAR_TRT_VERSION +ARG TAR_CUDA_VERSION + +# Directory containing TensorRT tar.gz installation package +ARG TRT_BINS_DIR=. +COPY ${TRT_BINS_DIR}/TensorRT-${TAR_TRT_VERSION}.Linux.x86_64-gnu.cuda-${TAR_CUDA_VERSION}.tar.gz /TensorRT-${TAR_TRT_VERSION}.tar.gz + # Install TensorRT from tar.gz RUN tar -xzvf /TensorRT-${TAR_TRT_VERSION}.tar.gz RUN cd /TensorRT-${TAR_TRT_VERSION}/python &&\ - python3 -m pip install tensorrt-${TAR_TRT_VERSION}-cp38-none-linux_x86_64.whl + python3 -m pip install tensorrt*cp38*.whl RUN cp -r /TensorRT-${TAR_TRT_VERSION}/lib/* /usr/lib/x86_64-linux-gnu/ RUN cp /TensorRT-${TAR_TRT_VERSION}/include/* /usr/local/include/ RUN cp /TensorRT-${TAR_TRT_VERSION}/bin/* /usr/local/bin/ -WORKDIR /code +# Install Valgrind +RUN apt-get install -y valgrind + +# Build final image from base. Builds ORT. +FROM base as final +ARG BUILD_USER=onnxruntimedev +ARG BUILD_UID=1000 +RUN adduser --gecos 'onnxruntime Build User' --disabled-password $BUILD_USER --uid $BUILD_UID +USER $BUILD_USER + +# ONNX Runtime arguments + +# URL to the github repo from which to clone ORT. +ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime + +# The local directory into which to clone ORT. +ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code + +# The git branch of ORT to checkout and build. +ARG ONNXRUNTIME_BRANCH=main + +# Optional. The specific commit to pull and build from. If not set, the latest commit is used. +ARG ONNXRUNTIME_COMMIT_ID + +# The supported CUDA architecture +ARG CMAKE_CUDA_ARCHITECTURES=75 # Prepare onnxruntime repository & build onnxruntime with TensorRT +WORKDIR ${ONNXRUNTIME_LOCAL_CODE_DIR} + +# Clone ORT repository with branch RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ - /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh &&\ - cd onnxruntime &&\ - /bin/sh build.sh --parallel --build_shared_lib --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --use_tensorrt --tensorrt_home /usr/lib/x86_64-linux-gnu/ --config Release --build_wheel --skip_tests --skip_submodule_sync --cmake_extra_defines '"CMAKE_CUDA_ARCHITECTURES='${CMAKE_CUDA_ARCHITECTURES}'"' &&\ - pip install /code/onnxruntime/build/Linux/Release/dist/*.whl &&\ - cd .. + /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh + +WORKDIR ${ONNXRUNTIME_LOCAL_CODE_DIR}/onnxruntime + +# Reset to a specific commit if specified by build args. 
+RUN if [ -z "$ONNXRUNTIME_COMMIT_ID" ] ; then echo "Building branch ${ONNXRUNTIME_BRANCH}" ;\ + else echo "Building branch ${ONNXRUNTIME_BRANCH} @ commit ${ONNXRUNTIME_COMMIT_ID}" &&\ + git reset --hard ${ONNXRUNTIME_COMMIT_ID} && git submodule update --recursive ; fi + +# Build ORT +ENV CUDA_MODULE_LOADING "LAZY" +ARG PARSER_CONFIG="" +RUN /bin/sh build.sh ${PARSER_CONFIG} --parallel --build_shared_lib --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --use_tensorrt --tensorrt_home /usr/lib/x86_64-linux-gnu/ --config Release --build_wheel --skip_tests --skip_submodule_sync --cmake_extra_defines '"CMAKE_CUDA_ARCHITECTURES='${CMAKE_CUDA_ARCHITECTURES}'"' + +# Switch to root to continue following steps of CI +USER root + +# Intall ORT wheel +RUN pip install ${ONNXRUNTIME_LOCAL_CODE_DIR}/onnxruntime/build/Linux/Release/dist/*.whl \ No newline at end of file From 7deee944c0daa9950167f6ac399c52c00c907924 Mon Sep 17 00:00:00 2001 From: Sheil Kumar Date: Fri, 8 Mar 2024 15:02:58 -0800 Subject: [PATCH 134/279] Implement STFT Decomposition transformer (#19725) Implement STFT Decomposition transformer. Certain hardware does not support DXIL, and therefore existing operator should be mapped to hardware supported functions. Optimized convolution can be used to implement STFT. --------- Co-authored-by: Sheil Kumar --- .../core/optimizer/stft_decomposition.cc | 381 ++++++++++++ .../core/optimizer/stft_decomposition.h | 30 + onnxruntime/core/providers/cpu/signal/dft.cc | 2 +- .../src/ExecutionProvider.cpp | 10 +- .../src/Operators/GeneratedShaders/stockham.h | 588 +++++++++--------- .../GeneratedShaders/stockham_fp16.h | 257 ++++---- .../src/Operators/Shaders/stockham.hlsl | 21 +- onnxruntime/core/session/inference_session.cc | 9 + 8 files changed, 864 insertions(+), 434 deletions(-) create mode 100644 onnxruntime/core/optimizer/stft_decomposition.cc create mode 100644 onnxruntime/core/optimizer/stft_decomposition.h diff --git a/onnxruntime/core/optimizer/stft_decomposition.cc b/onnxruntime/core/optimizer/stft_decomposition.cc new file mode 100644 index 0000000000000..a54904ff15e1e --- /dev/null +++ b/onnxruntime/core/optimizer/stft_decomposition.cc @@ -0,0 +1,381 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include + +#include "core/optimizer/stft_decomposition.h" +#include "core/optimizer/initializer.h" +#include "core/optimizer/utils.h" +#include "core/graph/graph_utils.h" +#include "core/optimizer/optimizer_execution_frame.h" +#include "core/optimizer/utils.h" +#include "core/framework/op_kernel.h" +#include "core/framework/tensorprotoutils.h" + +using namespace onnxruntime::common; + +namespace onnxruntime { + +STFTDecomposition::STFTDecomposition(const InlinedHashSet& compatible_execution_providers) noexcept + : GraphTransformer("STFTDecomposition", compatible_execution_providers) { +} + +template +constexpr static ONNX_NAMESPACE::TensorProto_DataType GetDataType() { + if constexpr (std::is_same::value) { + return ONNX_NAMESPACE::TensorProto_DataType_FLOAT; + } else if constexpr (std::is_same::value) { + return ONNX_NAMESPACE::TensorProto_DataType_FLOAT16; + } else if constexpr (std::is_same::value) { + return ONNX_NAMESPACE::TensorProto_DataType_DOUBLE; + } else if constexpr (std::is_same::value) { + return ONNX_NAMESPACE::TensorProto_DataType_INT64; + } else { + throw std::logic_error("Invalid data type requested for STFT decomposition"); + } +} + +template +NodeArg* AddInitializer(Graph& graph, const char* name, const int64_t (&shape)[TDims], const TDataType* begin) { + ONNX_NAMESPACE::TensorProto proto; + proto.set_name(graph.GenerateNodeArgName(name)); + proto.set_data_type(GetDataType()); + int64_t element_count = 1; + for (size_t i = 0; i < TDims; i++) { + element_count *= shape[i]; + proto.add_dims(shape[i]); + } + proto.set_raw_data(begin, element_count * sizeof(TDataType)); + return &graph_utils::AddInitializer(graph, proto); +} + +template +NodeArg* AddShapeInitializer(Graph& graph, const char* name, const int64_t (&shape)[TDims]) { + int64_t shape_shape[] = {TDims}; + return AddInitializer(graph, name, shape_shape, shape); +} + +std::pair AddNode(Graph& graph, + const char* op_type, + ProviderType execution_provider_type, + gsl::span inputs) { + auto def_name = graph.GenerateNodeArgName(op_type); + auto node_arg = &graph.GetOrCreateNodeArg(def_name, nullptr); + Node& node = graph.AddNode(graph.GenerateNodeName(op_type), + op_type, + "", + inputs, + {node_arg}); + node.SetExecutionProviderType(execution_provider_type); + return std::make_pair(&node, node_arg); +} + +std::pair AddNodeCast(Graph& graph, NodeArg* in, + ONNX_NAMESPACE::TensorProto_DataType data_type) { + auto def_name = graph.GenerateNodeArgName("Cast"); + auto node_arg = &graph.GetOrCreateNodeArg(def_name, nullptr); + Node& node = graph.AddNode(graph.GenerateNodeName("Cast"), + "Cast", + "", + {in}, + {node_arg}); + node.AddAttribute("to", static_cast(data_type)); + node.SetExecutionProviderType(kCpuExecutionProvider); + return std::make_pair(&node, node_arg); +} + +#define CONTINUE_IF_NO_DIM_VALUE(dim) \ + if (!dim.has_dim_value()) { \ + continue; \ + } +#define CONTINUE_IF_NULL(x) \ + if (x == nullptr) { \ + continue; \ + } + +/* + This function decomposes a STFT node into a subgraph. + The decomposition requires that: + 1) The signal input is real valued and not complex valued! + 2) Both (frame_step) *and* either (window or frame_length) inputs must be constant. + Otherwise the transform will not be applied. 
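+
+  The decomposition is possible because, for a real-valued signal, each DFT bin k of a
+  frame of dft_size samples is a dot product with fixed weights, which the Conv nodes
+  evaluate for every frame at once using 1 x dft_size kernels:
+      Re(X[k]) = sum_n x[n] * cos(-2*pi*k*n/dft_size)
+      Im(X[k]) = sum_n x[n] * sin(-2*pi*k*n/dft_size)
+  These are exactly the values written into the real/imaginary weight tensors below, and
+  the constant frame_step determines how far the window advances between frames.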
+ + Subgraph pattern 1: STFT with optional Window parameter set + [root]--(signal)--------------------+ + [root]--(frame_step)---------------+| + [root]--(window)------------------+|| + [root]--(frame_length) ----------+||| + |||| + vvvv + [STFT]--(output)--> + After Fusion: + [root]--(signal)-------------------------+ + [root] | + [root]--(window)--+ | + [root] | | + v v + (only for non-fp32) [Cast] +--[Reshape] + | | | + v | v + [Reshape]-->[Mul]---|-->[Conv]-------+ + | | | + | +-----| | + | v v + +------>[Mul]------>[Conv]-->[Concat]-->[Reshape]-->[Transpose]--(output)--> + + + Subgraph pattern 2: STFT without optional Window parameter set + [root]--(signal)-------------------+ + [root]--(frame_step)--------------+| + [root] | + [root]--(frame_length) ----------+|| + ||| + vvv + [STFT]--(output)--> + After Fusion: + [root]--(signal)-->[Reshape]-->[Conv] + [root] | | + [root] | v + [root] +------>[Conv]-->[Concat]-->[Reshape]-->[Transpose]--(output)--> +*/ +Status STFTDecomposition::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const { + GraphViewer graph_viewer(graph); + auto& order = graph_viewer.GetNodesInTopologicalOrder(); + + for (NodeIndex i : order) { + auto node = graph.GetNode(i); + CONTINUE_IF_NULL(node); + ORT_RETURN_IF_ERROR(Recurse(*node, modified, graph_level, logger)); + + if (node->OpType() != "STFT") { + continue; + } + + Node& stft = *node; + auto signal = stft.MutableInputDefs()[0]; + auto frame_step = stft.MutableInputDefs()[1]; + auto window = stft.MutableInputDefs()[2]; + auto frame_length = stft.MutableInputDefs()[3]; + + // If the signal has free dimensions, do not transform... + auto batch_size_dim = signal->Shape()->dim(0); + auto signal_length_dim = signal->Shape()->dim(1); + auto signal_components_dim = signal->Shape()->dim(2); + CONTINUE_IF_NO_DIM_VALUE(signal_length_dim); + CONTINUE_IF_NO_DIM_VALUE(signal_components_dim); + + auto batch_size = batch_size_dim.has_dim_value() ? batch_size_dim.dim_value() : static_cast(-1); + auto signal_length = signal_length_dim.dim_value(); + auto is_real = signal_components_dim.dim_value() == 1; + auto data_type = static_cast(signal->TypeAsProto()->tensor_type().elem_type()); + + auto frame_step_initializer = graph_utils::GetConstantInitializer(graph, frame_step->Name()); + auto window_initializer = graph_utils::GetConstantInitializer(graph, window->Name()); + auto frame_length_initializer = graph_utils::GetConstantInitializer(graph, frame_length->Name()); + CONTINUE_IF_NULL(frame_step_initializer); + if (!frame_length_initializer && !window_initializer) { + continue; + } + + auto read_int64_initializer = [](Graph& graph, const ONNX_NAMESPACE::TensorProto* initializer) { + return *Initializer(*initializer, graph.ModelPath()).data(); + }; + auto frame_step_value = read_int64_initializer(graph, frame_step_initializer); + + // Get DFT Size + int64_t dft_size = 0; + if (frame_length_initializer) { + dft_size = read_int64_initializer(graph, frame_length_initializer); + } + if (dft_size == 0 && window_initializer) { + auto window_length_dim = window->Shape()->dim(0); + CONTINUE_IF_NO_DIM_VALUE(window_length_dim); + dft_size = window_length_dim.dim_value(); + } + + bool is_onesided = true; + auto& attrs = stft.GetAttributes(); + if (attrs.find("onesided") != attrs.end()) { + auto& onesided_attr = attrs.at("onesided"); + if (utils::HasInt(onesided_attr)) { + is_onesided = static_cast(onesided_attr.i()); + } + } + + auto dft_unique_bins = is_onesided ? 
+
+    Node* signal_recipient = nullptr;
+    Node* window_recipient = nullptr;
+    Node* stft_producer = nullptr;
+    if (is_real) {
+      auto output_num_frames = stft.MutableOutputDefs()[0]->Shape()->dim(1).dim_value();
+      auto output_frame_length = stft.MutableOutputDefs()[0]->Shape()->dim(2).dim_value();
+      auto weight_size = static_cast<size_t>(dft_unique_bins * dft_size);
+      auto real_weights_data = std::vector<float>(weight_size);
+      auto imag_weights_data = std::vector<float>(weight_size);
+
+      // Populate weights
+      for (size_t k = 0; k < static_cast<size_t>(dft_unique_bins); k++) {
+        for (size_t n = 0; n < static_cast<size_t>(dft_size); n++) {
+          auto index = static_cast<size_t>(k * dft_size + n);
+          auto theta = -2 * M_PI * k * n / static_cast<float>(dft_size);
+          real_weights_data[index] = static_cast<float>(cos(theta));
+          imag_weights_data[index] = static_cast<float>(sin(theta));
+        }
+      }
+
+      const int64_t weight_shape[] = {dft_unique_bins, 1, 1, dft_size};
+      auto real_weights = AddInitializer(graph, "stft_real_conv_weights", weight_shape, real_weights_data.data());
+      auto imaginary_weights = AddInitializer(graph, "stft_imaginary_conv_weights", weight_shape, imag_weights_data.data());
+
+      const int64_t signal_reshaped[] = {batch_size, 1, 1, signal_length};
+      auto signal_shape = AddShapeInitializer(graph, "stft_signal_shape", signal_reshaped);
+
+      const int64_t unsqueezed_output_shape[] = {2, batch_size, output_frame_length, output_num_frames};
+      auto unsqueezed_shape = AddShapeInitializer(graph, "stft_output_reshaped", unsqueezed_output_shape);
+
+      NodeArg* signal_reshaped_inputs[] = {signal, signal_shape};
+      Node* reshape_signal_node = nullptr;
+      NodeArg* reshape_output = nullptr;
+      std::tie(reshape_signal_node, reshape_output) =
+          AddNode(graph, "Reshape", stft.GetExecutionProviderType(), signal_reshaped_inputs);
+
+      NodeArg* real_weights_final = real_weights;
+      NodeArg* imag_weights_final = imaginary_weights;
+      if (!window->Exists()) {
+        // When we are missing a window function
+        if (real_weights_final->TypeAsProto()->tensor_type().elem_type() != data_type) {
+          std::tie(std::ignore, real_weights_final) =
+              AddNodeCast(graph, real_weights_final, data_type);
+        }
+        if (imag_weights_final->TypeAsProto()->tensor_type().elem_type() != data_type) {
+          std::tie(std::ignore, imag_weights_final) =
+              AddNodeCast(graph, imag_weights_final, data_type);
+        }
+      } else {
+        // When we have a window function
+        const int64_t window_reshaped_shape[] = {1, 1, 1, dft_size};
+        auto window_shape = AddShapeInitializer(graph, "stft_window_shape", window_reshaped_shape);
+
+        auto window_final = window;
+        if (window->TypeAsProto()->tensor_type().elem_type() != GetDataType<float>()) {
+          Node* window_cast_node = nullptr;
+          std::tie(window_cast_node, window_final) =
+              AddNodeCast(graph, window, GetDataType<float>());
+          window_recipient = window_cast_node;
+        }
+
+        NodeArg* window_reshaped_inputs[] = {window_final, window_shape};
+        Node* window_reshape_node;
+        NodeArg* window_reshaped = nullptr;
+        std::tie(window_reshape_node, window_reshaped) =
+            AddNode(graph, "Reshape", kCpuExecutionProvider, window_reshaped_inputs);
+        if (!window_recipient) {
+          window_recipient = window_reshape_node;
+        }
+
+        NodeArg* scale_real_weights_inputs[] = {real_weights, window_reshaped};
+        NodeArg* windowed_real_weights_output = nullptr;
+        std::tie(std::ignore, windowed_real_weights_output) =
+            AddNode(graph, "Mul", kCpuExecutionProvider, scale_real_weights_inputs);
+
+        NodeArg* scale_imag_weights_inputs[] = {imaginary_weights, window_reshaped};
+        NodeArg* windowed_imag_weights_output = nullptr;
+        std::tie(std::ignore, windowed_imag_weights_output) =
+            AddNode(graph, "Mul", kCpuExecutionProvider, scale_imag_weights_inputs);
+
+        std::tie(std::ignore, real_weights_final) =
+            AddNodeCast(graph, windowed_real_weights_output, data_type);
+        std::tie(std::ignore, imag_weights_final) =
+            AddNodeCast(graph, windowed_imag_weights_output, data_type);
+      }
+
+      // Add Convolution (reals)
+      NodeArg* conv_real_inputs[] = {reshape_output, real_weights_final};
+      Node* real_conv_node = nullptr;
+      NodeArg* real_conv_output = nullptr;
+      std::tie(real_conv_node, real_conv_output) =
+          AddNode(graph, "Conv", stft.GetExecutionProviderType(), conv_real_inputs);
+      real_conv_node->AddAttribute("strides", std::vector<int64_t>{1, frame_step_value});
+
+      // Add Convolution (imaginary)
+      NodeArg* conv_imag_inputs[] = {reshape_output, imag_weights_final};
+      Node* imag_conv_node = nullptr;
+      NodeArg* imag_conv_output = nullptr;
+      std::tie(imag_conv_node, imag_conv_output) =
+          AddNode(graph, "Conv", stft.GetExecutionProviderType(), conv_imag_inputs);
+      imag_conv_node->AddAttribute("strides", std::vector<int64_t>{1, frame_step_value});
+
+      // Concatenate
+      NodeArg* concatenate_inputs[] = {real_conv_output, imag_conv_output};
+      Node* concat_node = nullptr;
+      NodeArg* concatenated_conv_output = nullptr;
+      std::tie(concat_node, concatenated_conv_output) =
+          AddNode(graph, "Concat", stft.GetExecutionProviderType(), concatenate_inputs);
+      concat_node->AddAttribute("axis", static_cast<int64_t>(0));
+
+      // Unsqueeze Reshape
+      NodeArg* unsqueeze_reshape_inputs[] = {concatenated_conv_output, unsqueezed_shape};
+      NodeArg* unsqueezed_output = nullptr;
+      std::tie(std::ignore, unsqueezed_output) =
+          AddNode(graph, "Reshape", stft.GetExecutionProviderType(), unsqueeze_reshape_inputs);
+
+      // Transpose
+      NodeArg* transpose_inputs[] = {unsqueezed_output};
+      Node* transpose_node = nullptr;
+      NodeArg* transpose_output = nullptr;
+      std::tie(transpose_node, transpose_output) =
+          AddNode(graph, "Transpose", stft.GetExecutionProviderType(), transpose_inputs);
+      transpose_node->AddAttribute("perm", std::vector<int64_t>{1, 3, 2, 0});
+
+      signal_recipient = reshape_signal_node;
+      stft_producer = transpose_node;
+    } else {
+      continue;
+    }
+
+    auto input_edges = graph_utils::GraphEdge::GetNodeInputEdges(stft);
+    auto output_edges = graph_utils::GraphEdge::GetNodeOutputEdges(stft);
+
+    // Copy inputs
+    auto signal_target_idx = signal_recipient->Index();
+    auto window_target_idx = window_recipient->Index();
+    for (auto cur = input_edges.cbegin(), end = input_edges.cend(); cur != end; ++cur) {
+      const graph_utils::GraphEdge& edge = *cur;
+      NodeIndex target_idx = 0;
+      Node* recipient = nullptr;
+      switch (cur->dst_arg_index) {
+        case 0:
+          target_idx = signal_target_idx;
+          recipient = signal_recipient;
+          break;
+        case 2:
+          target_idx = window_target_idx;
+          recipient = window_recipient;
+          break;
+      }
+
+      if (!recipient) {
+        continue;
+      }
+
+      auto arg_index = graph_utils::GetNodeInputIndexFromInputName(*recipient, edge.arg_name);
+      graph.AddEdge(edge.src_node, target_idx, edge.src_arg_index, arg_index);
+    }
+
+    // Copy STFT outputs to stft_producer
+    stft_producer->MutableOutputDefs() = stft.MutableOutputDefs();
+    auto stft_producer_target_idx = stft_producer->Index();
+    for (auto cur = output_edges.cbegin(), end = output_edges.cend(); cur != end; ++cur) {
+      graph.AddEdge(stft_producer_target_idx, cur->dst_node, cur->src_arg_index, cur->dst_arg_index);
+    }
+
+    graph_utils::GraphEdge::RemoveGraphEdges(graph, input_edges);
+    graph_utils::GraphEdge::RemoveGraphEdges(graph, output_edges);
+
+    graph.RemoveNode(stft.Index());
+
+    modified = true;
+  }
+  return Status::OK();
+}
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/stft_decomposition.h b/onnxruntime/core/optimizer/stft_decomposition.h
new file mode 100644
index 0000000000000..cac058474375e
--- /dev/null
+++ b/onnxruntime/core/optimizer/stft_decomposition.h
@@ -0,0 +1,30 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/optimizer/graph_transformer.h"
+#include "core/framework/ort_value.h"
+#include <string_view>
+#include "core/framework/execution_provider.h"
+
+namespace onnxruntime {
+
+/**
+@class STFTDecomposition
+
+Transformer that traverses the graph top-down and decomposes
+STFT into convolution.
+*/
+class STFTDecomposition : public GraphTransformer {
+ public:
+  /*! STFT decomposition.
+  \param compatible_execution_providers Execution providers for which the decomposed subgraph may be generated.
+  */
+  STFTDecomposition(const InlinedHashSet<std::string_view>& compatible_execution_providers = {}) noexcept;
+
+ private:
+  Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override;
+};
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/signal/dft.cc b/onnxruntime/core/providers/cpu/signal/dft.cc
index 15bf633579e5f..50fe7d1344eaf 100644
--- a/onnxruntime/core/providers/cpu/signal/dft.cc
+++ b/onnxruntime/core/providers/cpu/signal/dft.cc
@@ -506,7 +506,7 @@ static Status short_time_fourier_transform(OpKernelContext* ctx, bool is_oneside
   // Calculate the window size with preference to the window input.
   const auto window_size = window ? window->Shape()[0] : frame_length;
 
-  ORT_ENFORCE(window_size < signal_size, "Ensure that the dft size is smaller than the signal.");
+  ORT_ENFORCE(window_size <= signal_size, "Ensure that the dft size is no larger than the signal.");
 
   // Calculate the number of dfts to run
   const auto n_dfts =
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 8a32d06534dda..6c347ebdca7c1 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -771,8 +771,14 @@ namespace Dml
             !native16BitShaderOpsSupported &&
             IsCustomOpShader(node))
         {
-            nodeContainsSupportedDataTypes = false;
-            return;
+            // STFT is a special case since it has a DML EP registered
+            // graph transformation that will decompose fp16 STFT into convolution,
+            // so it is OK to register it for fp16.
+            if (strcmp("STFT", node.OpType().c_str()) != 0)
+            {
+                nodeContainsSupportedDataTypes = false;
+                return;
+            }
         }
 
         // Allow nodeArgs that are SequenceTensor when they are actually implemented by CPU Kernels.
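To make the intent of the new transformer easier to follow, here is a minimal, self-contained C++ sketch of the same idea. It is illustration only, not part of the patch, and it uses no ONNX Runtime APIs; every name in it (kDftSize, kFrameStep, kBins, and so on) is invented for this example. It builds the cosine/sine weight banks the same way the "Populate weights" loop above does, then slides them over a real signal with a stride of frame_step, which is what the generated Conv pair computes before the Concat/Reshape/Transpose nodes restore the STFT output layout of [batch, frames, bins, 2].

// Standalone sketch of a real-input STFT expressed as two strided 1-D correlations.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int kDftSize = 8;              // frame_length
  const int kFrameStep = 4;            // hop between frames
  const int kBins = kDftSize / 2 + 1;  // onesided output bins
  const double kPi = 3.14159265358979323846;

  // A toy real-valued signal.
  std::vector<float> signal(32);
  for (size_t i = 0; i < signal.size(); ++i) {
    signal[i] = std::sin(0.25f * static_cast<float>(i));
  }

  // Weight banks laid out like the generated Conv weights: one row of kDftSize taps per bin.
  std::vector<float> real_w(kBins * kDftSize), imag_w(kBins * kDftSize);
  for (int k = 0; k < kBins; ++k) {
    for (int n = 0; n < kDftSize; ++n) {
      const double theta = -2.0 * kPi * k * n / kDftSize;
      real_w[k * kDftSize + n] = static_cast<float>(std::cos(theta));
      imag_w[k * kDftSize + n] = static_cast<float>(std::sin(theta));
    }
  }

  // Strided correlation == Conv with strides {1, frame_step}: one signal frame per output column.
  const int n_frames = static_cast<int>((signal.size() - kDftSize) / kFrameStep) + 1;
  for (int f = 0; f < n_frames; ++f) {
    for (int k = 0; k < kBins; ++k) {
      float re = 0.0f, im = 0.0f;
      for (int n = 0; n < kDftSize; ++n) {
        re += signal[f * kFrameStep + n] * real_w[k * kDftSize + n];
        im += signal[f * kFrameStep + n] * imag_w[k * kDftSize + n];
      }
      std::printf("frame %d, bin %d: %f %+fi\n", f, k, re, im);
    }
  }
  return 0;
}

With a window input present, each weight row would additionally be scaled elementwise by the window before the sliding step, which is what the inserted Mul nodes do in the decomposed graph.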
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/GeneratedShaders/stockham.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/GeneratedShaders/stockham.h index 9c03b7f6de639..1bfd6e6c6068d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/GeneratedShaders/stockham.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/GeneratedShaders/stockham.h @@ -21,7 +21,7 @@ dcl_uav_structured u0, 4 dcl_uav_structured u1, 4 dcl_uav_structured u2, 4 dcl_input vThreadID.x -dcl_temps 6 +dcl_temps 5 dcl_thread_group 64, 1, 1 iadd r0.x, vThreadID.x, cb0[0].x ult r0.y, r0.x, cb0[0].y @@ -40,66 +40,57 @@ if_nz r0.y ieq r1.y, cb0[7].x, l(1) ult r1.z, r0.w, cb0[5].z and r1.z, r1.z, r1.y - if_nz r1.z - imul null, r1.z, r0.w, cb0[6].z - ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r4.x, r1.z, l(0), u2.xxxx - imad r1.z, r0.w, cb0[6].z, cb0[6].w - ieq r1.w, cb0[5].w, l(2) - ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r1.z, r1.z, l(0), u2.xxxx - and r4.y, r1.z, r1.w + imul null, r1.w, r0.w, cb0[6].z + ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r4.x, r1.w, l(0), u2.xxxx + ieq r1.w, cb0[5].w, l(2) + if_nz r1.w + imad r2.y, r0.w, cb0[6].z, cb0[6].w + ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r4.y, r2.y, l(0), u2.xxxx else - mov r4.xy, l(1.000000,0,0,0) + mov r4.y, l(0) endif + movc r2.yz, r1.zzzz, r4.yyxy, l(0,0,1.000000,0) ult r1.z, r0.w, cb0[1].y - if_nz r1.z - imul null, r0.w, r0.w, cb0[2].y - imad r0.w, r1.x, cb0[2].x, r0.w - imad r0.w, r3.x, cb0[2].z, r0.w - ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r5.x, r0.w, l(0), u0.xxxx - ieq r1.z, cb0[1].w, l(2) - if_nz r1.z - iadd r0.w, r0.w, cb0[2].w - ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r5.y, r0.w, l(0), u0.xxxx - else - mov r5.y, l(0) - endif + imul null, r1.x, r1.x, cb0[2].x + imad r0.w, r0.w, cb0[2].y, r1.x + imad r0.w, r3.x, cb0[2].z, r0.w + ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r4.x, r0.w, l(0), u0.xxxx + ieq r2.w, cb0[1].w, l(2) + if_nz r2.w + iadd r0.w, r0.w, cb0[2].w + ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r4.y, r0.w, l(0), u0.xxxx else - mov r5.xy, l(0,0,0,0) + mov r4.y, l(0) endif - mul r0.w, r4.y, r5.y - mad r0.w, r5.x, r4.x, -r0.w - dp2 r1.z, r5.yxyy, r4.xyxx - ult r1.w, r0.y, cb0[5].z - and r1.y, r1.w, r1.y - if_nz r1.y - imul null, r1.y, r0.y, cb0[6].z - ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r4.x, r1.y, l(0), u2.xxxx - imad r1.y, r0.y, cb0[6].z, cb0[6].w - ieq r1.w, cb0[5].w, l(2) - ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r1.y, r1.y, l(0), u2.xxxx - and r4.y, r1.y, r1.w + and r3.yz, r1.zzzz, r4.xxyx + mul r0.w, r2.y, r3.z + mad r0.w, r3.y, r2.z, -r0.w + dp2 r1.z, r3.yzyy, r2.yzyy + ult r2.y, r0.y, cb0[5].z + and r1.y, r1.y, r2.y + imul null, r2.y, r0.y, cb0[6].z + ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r4.x, r2.y, l(0), u2.xxxx + if_nz r1.w + imad r1.w, r0.y, cb0[6].z, cb0[6].w + ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r4.y, r1.w, l(0), u2.xxxx else - mov r4.xy, l(1.000000,0,0,0) + mov r4.y, l(0) endif - ult r1.y, r0.y, cb0[1].y - if_nz r1.y - imul null, r0.y, r0.y, cb0[2].y - imad r0.y, r1.x, cb0[2].x, r0.y - imad 
r0.y, r3.x, cb0[2].z, r0.y - ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r1.x, r0.y, l(0), u0.xxxx - ieq r1.w, cb0[1].w, l(2) - if_nz r1.w - iadd r0.y, r0.y, cb0[2].w - ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r1.y, r0.y, l(0), u0.xxxx - else - mov r1.y, l(0) - endif + movc r1.yw, r1.yyyy, r4.yyyx, l(0,0,0,1.000000) + ult r2.y, r0.y, cb0[1].y + imad r0.y, r0.y, cb0[2].y, r1.x + imad r0.y, r3.x, cb0[2].z, r0.y + ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r3.x, r0.y, l(0), u0.xxxx + if_nz r2.w + iadd r0.y, r0.y, cb0[2].w + ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r3.y, r0.y, l(0), u0.xxxx else - mov r1.xy, l(0,0,0,0) + mov r3.y, l(0) endif - mul r0.y, r4.y, r1.y - mad r0.y, r1.x, r4.x, -r0.y - dp2 r1.x, r1.yxyy, r4.xyxx + and r2.yz, r2.yyyy, r3.xxyx + mul r0.y, r1.y, r2.z + mad r0.y, r2.y, r1.w, -r0.y + dp2 r1.x, r2.yzyy, r1.ywyy udiv null, r1.y, r2.x, r0.z ieq r1.w, cb0[0].w, l(1) movc r1.w, r1.w, l(6.283185), l(-6.283185) @@ -117,17 +108,22 @@ if_nz r0.y mad r0.y, r3.x, r1.x, r0.y add r0.y, r0.y, r1.z mul r0.yw, r0.yyyw, cb0[7].zzzz - ne r1.x, cb0[7].y, l(0.000000) - mul r1.y, r1.y, r1.y - mul r1.y, r1.y, l(3.141593) - div r1.y, r1.y, cb0[7].y - sincos r2.x, r3.x, r1.y - mov r2.y, r3.x - movc r1.xy, r1.xxxx, r2.xyxx, l(0,1.000000,0,0) - mul r1.zw, r0.yyyy, r1.xxxy - mad r0.y, r0.w, r1.y, -r1.z - store_structured u1.x, r0.z, l(0), r0.y - mad r0.y, r0.w, r1.x, r1.w + eq r1.x, cb0[7].y, l(0.000000) + if_nz r1.x + mov r1.x, r0.w + else + ne r1.z, cb0[7].y, l(0.000000) + mul r1.y, r1.y, r1.y + mul r1.y, r1.y, l(3.141593) + div r1.y, r1.y, cb0[7].y + sincos r2.x, r3.x, r1.y + mov r2.y, r3.x + movc r1.yz, r1.zzzz, r2.xxyx, l(0,0,1.000000,0) + mul r2.xy, r0.yyyy, r1.yzyy + mad r1.x, r0.w, r1.z, -r2.x + mad r0.y, r0.w, r1.y, r2.y + endif + store_structured u1.x, r0.z, l(0), r1.x store_structured u1.x, r0.x, l(0), r0.y endif ret @@ -136,11 +132,11 @@ ret const BYTE g_DFT[] = { - 68, 88, 66, 67, 222, 156, - 188, 133, 179, 57, 118, 25, - 122, 216, 102, 13, 91, 242, - 99, 27, 1, 0, 0, 0, - 172, 12, 0, 0, 3, 0, + 68, 88, 66, 67, 63, 188, + 200, 227, 206, 73, 64, 21, + 140, 126, 47, 226, 169, 81, + 175, 134, 1, 0, 0, 0, + 112, 12, 0, 0, 3, 0, 0, 0, 44, 0, 0, 0, 60, 0, 0, 0, 76, 0, 0, 0, 73, 83, 71, 78, @@ -149,8 +145,8 @@ const BYTE g_DFT[] = 79, 83, 71, 78, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 83, 72, - 69, 88, 88, 12, 0, 0, - 80, 0, 5, 0, 22, 3, + 69, 88, 28, 12, 0, 0, + 80, 0, 5, 0, 7, 3, 0, 0, 106, 8, 0, 1, 89, 0, 0, 4, 70, 142, 32, 0, 0, 0, 0, 0, @@ -164,7 +160,7 @@ const BYTE g_DFT[] = 17, 0, 2, 0, 0, 0, 4, 0, 0, 0, 95, 0, 0, 2, 18, 0, 2, 0, - 104, 0, 0, 2, 6, 0, + 104, 0, 0, 2, 5, 0, 0, 0, 155, 0, 0, 4, 64, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, @@ -256,11 +252,9 @@ const BYTE g_DFT[] = 16, 0, 1, 0, 0, 0, 42, 0, 16, 0, 1, 0, 0, 0, 26, 0, 16, 0, - 1, 0, 0, 0, 31, 0, - 4, 3, 42, 0, 16, 0, 1, 0, 0, 0, 38, 0, 0, 9, 0, 208, 0, 0, - 66, 0, 16, 0, 1, 0, + 130, 0, 16, 0, 1, 0, 0, 0, 58, 0, 16, 0, 0, 0, 0, 0, 42, 128, 32, 0, 0, 0, 0, 0, @@ -268,221 +262,203 @@ const BYTE g_DFT[] = 0, 139, 2, 35, 0, 128, 131, 153, 25, 0, 18, 0, 16, 0, 4, 0, 0, 0, - 42, 0, 16, 0, 1, 0, + 58, 0, 16, 0, 1, 0, 0, 0, 1, 64, 0, 0, 0, 0, 0, 0, 6, 224, 17, 0, 2, 0, 0, 0, - 35, 0, 0, 11, 66, 0, + 32, 0, 0, 8, 130, 0, 16, 0, 1, 0, 0, 0, - 58, 0, 16, 0, 0, 0, - 0, 0, 42, 128, 32, 0, - 0, 0, 0, 0, 6, 0, - 0, 0, 58, 128, 32, 0, - 0, 0, 0, 0, 6, 0, - 0, 0, 32, 0, 0, 8, - 130, 0, 16, 0, 1, 0, - 0, 0, 58, 
128, 32, 0, - 0, 0, 0, 0, 5, 0, - 0, 0, 1, 64, 0, 0, - 2, 0, 0, 0, 167, 0, + 58, 128, 32, 0, 0, 0, + 0, 0, 5, 0, 0, 0, + 1, 64, 0, 0, 2, 0, + 0, 0, 31, 0, 4, 3, + 58, 0, 16, 0, 1, 0, + 0, 0, 35, 0, 0, 11, + 34, 0, 16, 0, 2, 0, + 0, 0, 58, 0, 16, 0, + 0, 0, 0, 0, 42, 128, + 32, 0, 0, 0, 0, 0, + 6, 0, 0, 0, 58, 128, + 32, 0, 0, 0, 0, 0, + 6, 0, 0, 0, 167, 0, 0, 139, 2, 35, 0, 128, - 131, 153, 25, 0, 66, 0, - 16, 0, 1, 0, 0, 0, - 42, 0, 16, 0, 1, 0, + 131, 153, 25, 0, 34, 0, + 16, 0, 4, 0, 0, 0, + 26, 0, 16, 0, 2, 0, 0, 0, 1, 64, 0, 0, 0, 0, 0, 0, 6, 224, 17, 0, 2, 0, 0, 0, - 1, 0, 0, 7, 34, 0, - 16, 0, 4, 0, 0, 0, - 42, 0, 16, 0, 1, 0, - 0, 0, 58, 0, 16, 0, - 1, 0, 0, 0, 18, 0, - 0, 1, 54, 0, 0, 8, - 50, 0, 16, 0, 4, 0, + 18, 0, 0, 1, 54, 0, + 0, 5, 34, 0, 16, 0, + 4, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 21, 0, 0, 1, 55, 0, + 0, 12, 98, 0, 16, 0, + 2, 0, 0, 0, 166, 10, + 16, 0, 1, 0, 0, 0, + 86, 4, 16, 0, 4, 0, 0, 0, 2, 64, 0, 0, - 0, 0, 128, 63, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 21, 0, - 0, 1, 79, 0, 0, 8, - 66, 0, 16, 0, 1, 0, - 0, 0, 58, 0, 16, 0, - 0, 0, 0, 0, 26, 128, - 32, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 31, 0, - 4, 3, 42, 0, 16, 0, - 1, 0, 0, 0, 38, 0, - 0, 9, 0, 208, 0, 0, - 130, 0, 16, 0, 0, 0, - 0, 0, 58, 0, 16, 0, - 0, 0, 0, 0, 26, 128, - 32, 0, 0, 0, 0, 0, - 2, 0, 0, 0, 35, 0, - 0, 10, 130, 0, 16, 0, - 0, 0, 0, 0, 10, 0, + 0, 0, 0, 0, 128, 63, + 0, 0, 0, 0, 79, 0, + 0, 8, 66, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 26, 128, 32, 0, 0, 0, + 0, 0, 1, 0, 0, 0, + 38, 0, 0, 9, 0, 208, + 0, 0, 18, 0, 16, 0, + 1, 0, 0, 0, 10, 0, 16, 0, 1, 0, 0, 0, 10, 128, 32, 0, 0, 0, 0, 0, 2, 0, 0, 0, + 35, 0, 0, 10, 130, 0, + 16, 0, 0, 0, 0, 0, 58, 0, 16, 0, 0, 0, - 0, 0, 35, 0, 0, 10, - 130, 0, 16, 0, 0, 0, + 0, 0, 26, 128, 32, 0, + 0, 0, 0, 0, 2, 0, 0, 0, 10, 0, 16, 0, - 3, 0, 0, 0, 42, 128, + 1, 0, 0, 0, 35, 0, + 0, 10, 130, 0, 16, 0, + 0, 0, 0, 0, 10, 0, + 16, 0, 3, 0, 0, 0, + 42, 128, 32, 0, 0, 0, + 0, 0, 2, 0, 0, 0, + 58, 0, 16, 0, 0, 0, + 0, 0, 167, 0, 0, 139, + 2, 35, 0, 128, 131, 153, + 25, 0, 18, 0, 16, 0, + 4, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 224, 17, 0, + 0, 0, 0, 0, 32, 0, + 0, 8, 130, 0, 16, 0, + 2, 0, 0, 0, 58, 128, 32, 0, 0, 0, 0, 0, - 2, 0, 0, 0, 58, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 2, 0, 0, 0, + 31, 0, 4, 3, 58, 0, + 16, 0, 2, 0, 0, 0, + 30, 0, 0, 8, 130, 0, 16, 0, 0, 0, 0, 0, - 167, 0, 0, 139, 2, 35, - 0, 128, 131, 153, 25, 0, - 18, 0, 16, 0, 5, 0, - 0, 0, 58, 0, 16, 0, - 0, 0, 0, 0, 1, 64, - 0, 0, 0, 0, 0, 0, - 6, 224, 17, 0, 0, 0, - 0, 0, 32, 0, 0, 8, - 66, 0, 16, 0, 1, 0, + 58, 0, 16, 0, 0, 0, 0, 0, 58, 128, 32, 0, - 0, 0, 0, 0, 1, 0, - 0, 0, 1, 64, 0, 0, - 2, 0, 0, 0, 31, 0, - 4, 3, 42, 0, 16, 0, - 1, 0, 0, 0, 30, 0, - 0, 8, 130, 0, 16, 0, - 0, 0, 0, 0, 58, 0, + 0, 0, 0, 0, 2, 0, + 0, 0, 167, 0, 0, 139, + 2, 35, 0, 128, 131, 153, + 25, 0, 34, 0, 16, 0, + 4, 0, 0, 0, 58, 0, 16, 0, 0, 0, 0, 0, - 58, 128, 32, 0, 0, 0, - 0, 0, 2, 0, 0, 0, - 167, 0, 0, 139, 2, 35, - 0, 128, 131, 153, 25, 0, - 34, 0, 16, 0, 5, 0, - 0, 0, 58, 0, 16, 0, - 0, 0, 0, 0, 1, 64, - 0, 0, 0, 0, 0, 0, - 6, 224, 17, 0, 0, 0, - 0, 0, 18, 0, 0, 1, - 54, 0, 0, 5, 34, 0, - 16, 0, 5, 0, 0, 0, 1, 64, 0, 0, 0, 0, - 0, 0, 21, 0, 0, 1, - 18, 0, 0, 1, 54, 0, - 0, 8, 50, 0, 16, 0, - 5, 0, 0, 0, 2, 64, - 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - 21, 0, 0, 1, 56, 0, - 0, 7, 130, 0, 16, 0, - 0, 0, 0, 0, 26, 0, - 16, 0, 4, 0, 0, 0, - 26, 0, 16, 0, 5, 0, - 0, 0, 50, 0, 0, 10, - 130, 0, 16, 0, 0, 0, - 0, 0, 10, 0, 16, 0, - 5, 0, 0, 0, 10, 0, + 0, 0, 6, 224, 17, 
0, + 0, 0, 0, 0, 18, 0, + 0, 1, 54, 0, 0, 5, + 34, 0, 16, 0, 4, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 21, 0, + 0, 1, 1, 0, 0, 7, + 98, 0, 16, 0, 3, 0, + 0, 0, 166, 10, 16, 0, + 1, 0, 0, 0, 6, 1, 16, 0, 4, 0, 0, 0, - 58, 0, 16, 128, 65, 0, - 0, 0, 0, 0, 0, 0, - 15, 0, 0, 7, 66, 0, - 16, 0, 1, 0, 0, 0, - 22, 5, 16, 0, 5, 0, - 0, 0, 70, 0, 16, 0, - 4, 0, 0, 0, 79, 0, - 0, 8, 130, 0, 16, 0, + 56, 0, 0, 7, 130, 0, + 16, 0, 0, 0, 0, 0, + 26, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 3, 0, 0, 0, 50, 0, + 0, 10, 130, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 3, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 58, 0, 16, 128, + 65, 0, 0, 0, 0, 0, + 0, 0, 15, 0, 0, 7, + 66, 0, 16, 0, 1, 0, + 0, 0, 150, 5, 16, 0, + 3, 0, 0, 0, 150, 5, + 16, 0, 2, 0, 0, 0, + 79, 0, 0, 8, 34, 0, + 16, 0, 2, 0, 0, 0, + 26, 0, 16, 0, 0, 0, + 0, 0, 42, 128, 32, 0, + 0, 0, 0, 0, 5, 0, + 0, 0, 1, 0, 0, 7, + 34, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 0, 1, 0, 0, 0, 26, 0, + 16, 0, 2, 0, 0, 0, + 38, 0, 0, 9, 0, 208, + 0, 0, 34, 0, 16, 0, + 2, 0, 0, 0, 26, 0, 16, 0, 0, 0, 0, 0, 42, 128, 32, 0, 0, 0, - 0, 0, 5, 0, 0, 0, - 1, 0, 0, 7, 34, 0, - 16, 0, 1, 0, 0, 0, - 58, 0, 16, 0, 1, 0, + 0, 0, 6, 0, 0, 0, + 167, 0, 0, 139, 2, 35, + 0, 128, 131, 153, 25, 0, + 18, 0, 16, 0, 4, 0, 0, 0, 26, 0, 16, 0, - 1, 0, 0, 0, 31, 0, - 4, 3, 26, 0, 16, 0, - 1, 0, 0, 0, 38, 0, - 0, 9, 0, 208, 0, 0, - 34, 0, 16, 0, 1, 0, + 2, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 224, 17, 0, 2, 0, + 0, 0, 31, 0, 4, 3, + 58, 0, 16, 0, 1, 0, + 0, 0, 35, 0, 0, 11, + 130, 0, 16, 0, 1, 0, 0, 0, 26, 0, 16, 0, 0, 0, 0, 0, 42, 128, + 32, 0, 0, 0, 0, 0, + 6, 0, 0, 0, 58, 128, 32, 0, 0, 0, 0, 0, 6, 0, 0, 0, 167, 0, 0, 139, 2, 35, 0, 128, - 131, 153, 25, 0, 18, 0, + 131, 153, 25, 0, 34, 0, 16, 0, 4, 0, 0, 0, - 26, 0, 16, 0, 1, 0, + 58, 0, 16, 0, 1, 0, 0, 0, 1, 64, 0, 0, 0, 0, 0, 0, 6, 224, 17, 0, 2, 0, 0, 0, - 35, 0, 0, 11, 34, 0, - 16, 0, 1, 0, 0, 0, - 26, 0, 16, 0, 0, 0, - 0, 0, 42, 128, 32, 0, - 0, 0, 0, 0, 6, 0, - 0, 0, 58, 128, 32, 0, - 0, 0, 0, 0, 6, 0, - 0, 0, 32, 0, 0, 8, - 130, 0, 16, 0, 1, 0, - 0, 0, 58, 128, 32, 0, - 0, 0, 0, 0, 5, 0, - 0, 0, 1, 64, 0, 0, - 2, 0, 0, 0, 167, 0, - 0, 139, 2, 35, 0, 128, - 131, 153, 25, 0, 34, 0, + 18, 0, 0, 1, 54, 0, + 0, 5, 34, 0, 16, 0, + 4, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 21, 0, 0, 1, 55, 0, + 0, 12, 162, 0, 16, 0, + 1, 0, 0, 0, 86, 5, 16, 0, 1, 0, 0, 0, - 26, 0, 16, 0, 1, 0, - 0, 0, 1, 64, 0, 0, - 0, 0, 0, 0, 6, 224, - 17, 0, 2, 0, 0, 0, - 1, 0, 0, 7, 34, 0, - 16, 0, 4, 0, 0, 0, - 26, 0, 16, 0, 1, 0, - 0, 0, 58, 0, 16, 0, - 1, 0, 0, 0, 18, 0, - 0, 1, 54, 0, 0, 8, - 50, 0, 16, 0, 4, 0, + 86, 1, 16, 0, 4, 0, 0, 0, 2, 64, 0, 0, - 0, 0, 128, 63, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 21, 0, - 0, 1, 79, 0, 0, 8, - 34, 0, 16, 0, 1, 0, - 0, 0, 26, 0, 16, 0, - 0, 0, 0, 0, 26, 128, - 32, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 31, 0, - 4, 3, 26, 0, 16, 0, - 1, 0, 0, 0, 38, 0, - 0, 9, 0, 208, 0, 0, - 34, 0, 16, 0, 0, 0, - 0, 0, 26, 0, 16, 0, - 0, 0, 0, 0, 26, 128, - 32, 0, 0, 0, 0, 0, - 2, 0, 0, 0, 35, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 128, 63, 79, 0, + 0, 8, 34, 0, 16, 0, + 2, 0, 0, 0, 26, 0, + 16, 0, 0, 0, 0, 0, + 26, 128, 32, 0, 0, 0, + 0, 0, 1, 0, 0, 0, + 35, 0, 0, 10, 34, 0, + 16, 0, 0, 0, 0, 0, + 26, 0, 16, 0, 0, 0, + 0, 0, 26, 128, 32, 0, + 0, 0, 0, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 35, 0, 0, 10, 34, 0, 16, 0, 0, 0, 0, 0, 10, 0, - 16, 0, 1, 0, 0, 0, - 10, 128, 32, 0, 0, 0, + 16, 0, 3, 0, 0, 0, + 42, 128, 32, 0, 0, 0, 0, 0, 2, 0, 0, 0, 26, 0, 16, 0, 0, 0, - 0, 0, 35, 0, 0, 10, - 34, 0, 16, 0, 0, 0, - 0, 0, 10, 0, 16, 0, - 3, 0, 0, 0, 42, 128, - 
32, 0, 0, 0, 0, 0, - 2, 0, 0, 0, 26, 0, + 0, 0, 167, 0, 0, 139, + 2, 35, 0, 128, 131, 153, + 25, 0, 18, 0, 16, 0, + 3, 0, 0, 0, 26, 0, 16, 0, 0, 0, 0, 0, - 167, 0, 0, 139, 2, 35, - 0, 128, 131, 153, 25, 0, - 18, 0, 16, 0, 1, 0, - 0, 0, 26, 0, 16, 0, - 0, 0, 0, 0, 1, 64, - 0, 0, 0, 0, 0, 0, - 6, 224, 17, 0, 0, 0, - 0, 0, 32, 0, 0, 8, - 130, 0, 16, 0, 1, 0, - 0, 0, 58, 128, 32, 0, - 0, 0, 0, 0, 1, 0, - 0, 0, 1, 64, 0, 0, - 2, 0, 0, 0, 31, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 224, 17, 0, + 0, 0, 0, 0, 31, 0, 4, 3, 58, 0, 16, 0, - 1, 0, 0, 0, 30, 0, + 2, 0, 0, 0, 30, 0, 0, 8, 34, 0, 16, 0, 0, 0, 0, 0, 26, 0, 16, 0, 0, 0, 0, 0, @@ -490,39 +466,37 @@ const BYTE g_DFT[] = 0, 0, 2, 0, 0, 0, 167, 0, 0, 139, 2, 35, 0, 128, 131, 153, 25, 0, - 34, 0, 16, 0, 1, 0, + 34, 0, 16, 0, 3, 0, 0, 0, 26, 0, 16, 0, 0, 0, 0, 0, 1, 64, 0, 0, 0, 0, 0, 0, 6, 224, 17, 0, 0, 0, 0, 0, 18, 0, 0, 1, 54, 0, 0, 5, 34, 0, - 16, 0, 1, 0, 0, 0, + 16, 0, 3, 0, 0, 0, 1, 64, 0, 0, 0, 0, 0, 0, 21, 0, 0, 1, - 18, 0, 0, 1, 54, 0, - 0, 8, 50, 0, 16, 0, - 1, 0, 0, 0, 2, 64, - 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - 21, 0, 0, 1, 56, 0, + 1, 0, 0, 7, 98, 0, + 16, 0, 2, 0, 0, 0, + 86, 5, 16, 0, 2, 0, + 0, 0, 6, 1, 16, 0, + 3, 0, 0, 0, 56, 0, 0, 7, 34, 0, 16, 0, 0, 0, 0, 0, 26, 0, - 16, 0, 4, 0, 0, 0, - 26, 0, 16, 0, 1, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 2, 0, 0, 0, 50, 0, 0, 10, 34, 0, 16, 0, 0, 0, - 0, 0, 10, 0, 16, 0, - 1, 0, 0, 0, 10, 0, - 16, 0, 4, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, 26, 0, 16, 128, 65, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 7, 18, 0, 16, 0, 1, 0, 0, 0, - 22, 5, 16, 0, 1, 0, - 0, 0, 70, 0, 16, 0, - 4, 0, 0, 0, 78, 0, + 150, 5, 16, 0, 2, 0, + 0, 0, 214, 5, 16, 0, + 1, 0, 0, 0, 78, 0, 0, 8, 0, 208, 0, 0, 34, 0, 16, 0, 1, 0, 0, 0, 10, 0, 16, 0, @@ -610,65 +584,77 @@ const BYTE g_DFT[] = 16, 0, 0, 0, 0, 0, 166, 138, 32, 0, 0, 0, 0, 0, 7, 0, 0, 0, - 57, 0, 0, 8, 18, 0, + 24, 0, 0, 8, 18, 0, 16, 0, 1, 0, 0, 0, 26, 128, 32, 0, 0, 0, 0, 0, 7, 0, 0, 0, 1, 64, 0, 0, 0, 0, + 0, 0, 31, 0, 4, 3, + 10, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 5, + 18, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 0, 0, 0, 0, 18, 0, + 0, 1, 57, 0, 0, 8, + 66, 0, 16, 0, 1, 0, + 0, 0, 26, 128, 32, 0, + 0, 0, 0, 0, 7, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 56, 0, + 0, 7, 34, 0, 16, 0, + 1, 0, 0, 0, 26, 0, + 16, 0, 1, 0, 0, 0, + 26, 0, 16, 0, 1, 0, 0, 0, 56, 0, 0, 7, 34, 0, 16, 0, 1, 0, 0, 0, 26, 0, 16, 0, - 1, 0, 0, 0, 26, 0, - 16, 0, 1, 0, 0, 0, - 56, 0, 0, 7, 34, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 219, 15, 73, 64, + 14, 0, 0, 8, 34, 0, 16, 0, 1, 0, 0, 0, 26, 0, 16, 0, 1, 0, - 0, 0, 1, 64, 0, 0, - 219, 15, 73, 64, 14, 0, - 0, 8, 34, 0, 16, 0, - 1, 0, 0, 0, 26, 0, + 0, 0, 26, 128, 32, 0, + 0, 0, 0, 0, 7, 0, + 0, 0, 77, 0, 0, 7, + 18, 0, 16, 0, 2, 0, + 0, 0, 18, 0, 16, 0, + 3, 0, 0, 0, 26, 0, 16, 0, 1, 0, 0, 0, - 26, 128, 32, 0, 0, 0, - 0, 0, 7, 0, 0, 0, - 77, 0, 0, 7, 18, 0, + 54, 0, 0, 5, 34, 0, 16, 0, 2, 0, 0, 0, - 18, 0, 16, 0, 3, 0, - 0, 0, 26, 0, 16, 0, - 1, 0, 0, 0, 54, 0, - 0, 5, 34, 0, 16, 0, - 2, 0, 0, 0, 10, 0, - 16, 0, 3, 0, 0, 0, - 55, 0, 0, 12, 50, 0, - 16, 0, 1, 0, 0, 0, - 6, 0, 16, 0, 1, 0, - 0, 0, 70, 0, 16, 0, - 2, 0, 0, 0, 2, 64, + 10, 0, 16, 0, 3, 0, + 0, 0, 55, 0, 0, 12, + 98, 0, 16, 0, 1, 0, + 0, 0, 166, 10, 16, 0, + 1, 0, 0, 0, 6, 1, + 16, 0, 2, 0, 0, 0, + 2, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 63, 0, 0, - 0, 0, 0, 0, 0, 0, - 56, 0, 0, 7, 194, 0, + 0, 0, 56, 0, 0, 7, + 50, 0, 16, 0, 2, 0, + 0, 0, 86, 5, 16, 0, + 0, 0, 0, 0, 150, 5, 16, 0, 1, 0, 0, 0, - 86, 5, 16, 0, 0, 0, - 0, 0, 6, 4, 
16, 0, - 1, 0, 0, 0, 50, 0, - 0, 10, 34, 0, 16, 0, + 50, 0, 0, 10, 18, 0, + 16, 0, 1, 0, 0, 0, + 58, 0, 16, 0, 0, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 128, 65, 0, 0, 0, + 2, 0, 0, 0, 50, 0, + 0, 9, 34, 0, 16, 0, 0, 0, 0, 0, 58, 0, 16, 0, 0, 0, 0, 0, 26, 0, 16, 0, 1, 0, - 0, 0, 42, 0, 16, 128, - 65, 0, 0, 0, 1, 0, - 0, 0, 168, 0, 0, 9, + 0, 0, 26, 0, 16, 0, + 2, 0, 0, 0, 21, 0, + 0, 1, 168, 0, 0, 9, 18, 224, 17, 0, 1, 0, 0, 0, 42, 0, 16, 0, 0, 0, 0, 0, 1, 64, 0, 0, 0, 0, 0, 0, - 26, 0, 16, 0, 0, 0, - 0, 0, 50, 0, 0, 9, - 34, 0, 16, 0, 0, 0, - 0, 0, 58, 0, 16, 0, - 0, 0, 0, 0, 10, 0, - 16, 0, 1, 0, 0, 0, - 58, 0, 16, 0, 1, 0, + 10, 0, 16, 0, 1, 0, 0, 0, 168, 0, 0, 9, 18, 224, 17, 0, 1, 0, 0, 0, 10, 0, 16, 0, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/GeneratedShaders/stockham_fp16.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/GeneratedShaders/stockham_fp16.h index 988c0aa66ade2..56ce759875687 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/GeneratedShaders/stockham_fp16.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/GeneratedShaders/stockham_fp16.h @@ -15,7 +15,7 @@ ; Name Index Mask Register SysValue Format Used ; -------------------- ----- ------ -------- -------- ------- ------ ; no parameters -; shader hash: e08f21199c48b0db30bf21bd8c5b80dc +; shader hash: 6a1d88feb14177832f5ee49ca330c549 ; ; Pipeline Runtime Information: ; @@ -125,7 +125,7 @@ define void @DFT() { %47 = fpext half %46 to float %48 = extractvalue %dx.types.CBufRet.i32 %37, 3 %49 = icmp eq i32 %48, 2 - br i1 %49, label %50, label %56 + br i1 %49, label %50, label %56, !dx.controlflow.hints !15 ;